bitkeeper revision 1.1247 (42386d3dpoPovazcjxeV5wadySvQoA)
author rneugeba@wyvis.research.intel-research.net <rneugeba@wyvis.research.intel-research.net>
Wed, 16 Mar 2005 17:30:37 +0000 (17:30 +0000)
committer rneugeba@wyvis.research.intel-research.net <rneugeba@wyvis.research.intel-research.net>
Wed, 16 Mar 2005 17:30:37 +0000 (17:30 +0000)
michael's initial shadow code

Signed-off-by: michael.fetterman@cl.cam.ac.uk
19 files changed:
.rootkeys
xen/arch/x86/audit.c [new file with mode: 0644]
xen/arch/x86/domain.c
xen/arch/x86/domain_build.c
xen/arch/x86/mm.c
xen/arch/x86/shadow.c
xen/arch/x86/traps.c
xen/arch/x86/vmx.c
xen/arch/x86/x86_32/domain_page.c
xen/common/dom_mem_ops.c
xen/common/page_alloc.c
xen/common/schedule.c
xen/include/asm-x86/domain.h
xen/include/asm-x86/mm.h
xen/include/asm-x86/page.h
xen/include/asm-x86/shadow.h
xen/include/asm-x86/x86_32/page.h
xen/include/xen/domain.h
xen/include/xen/perfc_defn.h

index 08be4b86908cc0328981f1ab003622946e01e091..74540dc3eaea7bb451d4dfd40099aaaea21b0257 100644 (file)
--- a/.rootkeys
+++ b/.rootkeys
 3ddb79bcBQF85CfLS4i1WGZ4oLLaCA xen/arch/x86/Rules.mk
 3e5636e5FAYZ5_vQnmgwFJfSdmO5Mw xen/arch/x86/acpi.c
 3ddb79bcsjinG9k1KcvbVBuas1R2dA xen/arch/x86/apic.c
+42386d3bKw0QftYe-cDL6_4WiATRTw xen/arch/x86/audit.c
 3ddb79c4yGZ7_22QAFFwPzqP4NSHwA xen/arch/x86/boot/mkelf32.c
 3ddb79bcSC_LvnmFlX-T5iTgaR0SKg xen/arch/x86/boot/x86_32.S
 40e42bdbNu4MjI750THP_8J1S-Sa0g xen/arch/x86/boot/x86_64.S
diff --git a/xen/arch/x86/audit.c b/xen/arch/x86/audit.c
new file mode 100644 (file)
index 0000000..1c5b89f
--- /dev/null
@@ -0,0 +1,817 @@
+/******************************************************************************
+ * arch/x86/audit.c
+ * 
+ * Copyright (c) 2002-2005 K A Fraser
+ * Copyright (c) 2004 Christian Limpach
+ * Copyright (c) 2005 Michael A Fetterman
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/kernel.h>
+#include <xen/lib.h>
+#include <xen/mm.h>
+//#include <xen/sched.h>
+//#include <xen/errno.h>
+#include <xen/perfc.h>
+//#include <xen/irq.h>
+//#include <xen/softirq.h>
+#include <asm/shadow.h>
+#include <asm/page.h>
+#include <asm/flushtlb.h>
+//#include <asm/io.h>
+//#include <asm/uaccess.h>
+//#include <asm/domain_page.h>
+//#include <asm/ldt.h>
+
+// XXX SMP bug -- these should not be statics...
+//
+// Per-audit-pass statistics: total type-count / general-count adjustments,
+// counts of io/lowmem mappings skipped, and per-page-type tallies.  They are
+// reset by the caller (_audit_domain) before each pass.
+static int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
+static int l1, l2, oos_count, page_count;
+
+// When FILE_AND_LINE is set, adjust() forwards __FILE__/__LINE__ of the call
+// site into _adjust(), and APRINTK automatically appends " file:line" to its
+// message -- callers must NOT pass file/line themselves.
+#define FILE_AND_LINE 1
+
+#if FILE_AND_LINE
+#define adjust(_p, _a) _adjust((_p), (_a), __FILE__, __LINE__)
+#define ADJUST_EXTRA_ARGS ,const char *file, int line
+#define APRINTK(_f, _a...) printk(_f " %s:%d\n", ## _a, file, line)
+#else
+#define adjust _adjust
+#define ADJUST_EXTRA_ARGS
+#define APRINTK(_f, _a...) printk(_f "\n", ##_a)
+#endif
+
+/*
+ * Walk all page tables (guest and, when enabled, shadow) belonging to
+ * domain 'd' and adjust the type/general reference count of every frame
+ * they reference by 'dir' (+1 or -1).  With noisy != 0, also cross-check
+ * ownership and page types, printing diagnostics.  Returns the number of
+ * errors detected.  Uses GNU nested functions so the helpers can share
+ * 'd', 'dir', 'noisy', 'errors' and 'shadow_enabled'.
+ */
+int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
+{
+    int errors = 0;
+    int shadow_enabled = shadow_mode_enabled(d) ? 1 : 0;
+
+    // Adjust one frame's general count by 'dir', and its type count too
+    // when 'adjtype' is set, validating that neither goes negative nor
+    // overflows its mask.
+    void _adjust(struct pfn_info *page, int adjtype ADJUST_EXTRA_ARGS)
+    {
+        if ( adjtype )
+        {
+            // adjust the type count
+            //
+            int tcount = page->u.inuse.type_info & PGT_count_mask;
+            tcount += dir;
+            ttot++;
+
+            if ( page_get_owner(page) == NULL )
+            {
+                // NB: APRINTK appends file:line itself -- passing them here
+                // as well would misalign the printk argument list.
+                APRINTK("adjust(mfn=%p, dir=%d, adjtype=%d) owner=NULL",
+                        page_to_pfn(page), dir, adjtype);
+                errors++;
+            }
+
+            if ( tcount < 0 )
+            {
+                APRINTK("Audit %d: type count went below zero mfn=%x t=%x ot=%x",
+                        d->id, page-frame_table,
+                        page->u.inuse.type_info,
+                        page->tlbflush_timestamp);
+                errors++;
+            }
+            else if ( (tcount & ~PGT_count_mask) != 0 )
+            {
+                APRINTK("Audit %d: type count overflowed mfn=%x t=%x ot=%x",
+                        d->id, page-frame_table,
+                        page->u.inuse.type_info,
+                        page->tlbflush_timestamp);
+                errors++;
+            }
+            else
+                page->u.inuse.type_info += dir;
+        }
+
+        // adjust the general count
+        //
+        int count = page->count_info & PGC_count_mask;
+        count += dir;
+        ctot++;
+
+        if ( count < 0 )
+        {
+            APRINTK("Audit %d: general count went below zero pfn=%x t=%x ot=%x",
+                    d->id, page-frame_table,
+                    page->u.inuse.type_info,
+                    page->tlbflush_timestamp);
+            errors++;
+        }
+        else if ( (count & ~PGC_count_mask) != 0 ) // general count => PGC mask
+        {
+            APRINTK("Audit %d: general count overflowed pfn=%x t=%x ot=%x",
+                    d->id, page-frame_table,
+                    page->u.inuse.type_info,
+                    page->tlbflush_timestamp);
+            errors++;
+        }
+        else
+            page->count_info += dir;
+    }
+
+    // Adjust the refcount of every present L1 page referenced by the L2
+    // table at 'mfn'.  For external shadow mode all entries are scanned;
+    // otherwise only the guest-visible (domain) portion.
+    void adjust_l2_page(unsigned long mfn, int adjtype)
+    {
+        unsigned long *pt = map_domain_mem(mfn << PAGE_SHIFT);
+        int i, limit;
+
+        if ( shadow_mode_external(d) )
+            limit = L2_PAGETABLE_ENTRIES;
+        else
+            limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
+
+        for ( i = 0; i < limit; i++ )
+        {
+            if ( pt[i] & _PAGE_PRESENT )
+            {
+                unsigned long l1mfn = pt[i] >> PAGE_SHIFT;
+                struct pfn_info *l1page = pfn_to_page(l1mfn);
+
+                if ( noisy )
+                {
+                    if ( shadow_enabled )
+                    {
+                        // Shadow L1 pages are anonymous (no owner).
+                        if ( page_get_owner(l1page) != NULL )
+                        {
+                            printk("L2: Bizarre shadow L1 page mfn=%p "
+                                   "belonging to a domain %p (id=%d)\n",
+                                   l1mfn,
+                                   page_get_owner(l1page),
+                                   page_get_owner(l1page)->id);
+                            errors++;
+                            continue;
+                        }
+                    }
+                    else
+                    {
+                        if ( page_get_owner(l1page) != d )
+                        {
+                            printk("L2: Skip bizarre L1 page mfn=%p "
+                                   "belonging to other dom %p (id=%d)\n",
+                                   l1mfn,
+                                   page_get_owner(l1page),
+                                   page_get_owner(l1page)->id);
+                            errors++;
+                            continue;
+                        }
+
+                        u32 page_type = l1page->u.inuse.type_info & PGT_type_mask;
+
+                        if ( page_type == PGT_l2_page_table )
+                        {
+                            // An L2 entry pointing at an L2 page => linear PT.
+                            printk("Audit %d: [%x] Found %s Linear PT "
+                                   "t=%x mfn=%p\n",
+                                   d->id, i, (l1mfn==mfn) ? "Self" : "Other",
+                                   l1page->u.inuse.type_info, l1mfn);
+                        }
+                        else if ( page_type != PGT_l1_page_table )
+                        {
+                            printk("Audit %d: [L2 mfn=%p i=%x] "
+                                   "Expected L1 t=%x mfn=%p\n",
+                                   d->id, mfn, i,
+                                   l1page->u.inuse.type_info, l1mfn);
+                            errors++;
+                        }
+                    }
+                }
+
+                adjust(l1page, adjtype);
+            }
+        }
+
+        unmap_domain_mem(pt);
+    }
+
+    // Adjust the refcount of every present frame referenced by the L1
+    // table at 'l1mfn'.  Low-memory (<0x100) and beyond-RAM (io) targets
+    // are counted but otherwise skipped.
+    void adjust_l1_page(unsigned long l1mfn)
+    {
+        unsigned long *pt = map_domain_mem(l1mfn << PAGE_SHIFT);
+        int i;
+
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        {
+            if ( pt[i] & _PAGE_PRESENT )
+            {
+                unsigned long gmfn = pt[i] >> PAGE_SHIFT;
+                struct pfn_info *gpage = pfn_to_page(gmfn);
+
+                if ( gmfn < 0x100 )
+                {
+                    lowmem_mappings++;
+                    continue;
+                }
+
+                if ( gmfn > max_page )
+                {
+                    io_mappings++;
+                    continue;
+                }
+
+                if ( noisy )
+                {
+                    if ( pt[i] & _PAGE_RW )
+                    {
+                        // If it's not a writable page, complain.
+                        //
+                        if ( !((gpage->u.inuse.type_info & PGT_type_mask) ==
+                               PGT_writable_page) )
+                        {
+                            printk("Audit %d: [l1mfn=%p, i=%x] Illegal RW "
+                                   "t=%x mfn=%p\n",
+                                   d->id, l1mfn, i,
+                                   gpage->u.inuse.type_info, gmfn);
+                            errors++;
+                        }
+
+                        // In shadow mode an in-sync page table must never be
+                        // mapped writable.
+                        if ( shadow_enabled &&
+                             page_is_page_table(gpage) &&
+                             ! page_out_of_sync(gpage) )
+                        {
+                            printk("Audit %d: [l1mfn=%p, i=%x] Illegal RW of "
+                                   "page table gmfn=%p\n",
+                                   d->id, l1mfn, i, gmfn);
+                            errors++;
+                        }
+                    }
+
+                    if ( page_get_owner(gpage) != d )
+                    {
+                        printk("Audit %d: [l1mfn=%p,i=%x] Skip foreign page "
+                               "dom=%p (id=%d) mfn=%p c=%08x t=%08x\n",
+                               d->id, l1mfn, i,
+                               page_get_owner(gpage),
+                               page_get_owner(gpage)->id,
+                               gmfn,
+                               gpage->count_info,
+                               gpage->u.inuse.type_info);
+                        continue;
+                    }
+                }
+
+                // RW mappings take a type ref (writable) as well.
+                adjust(gpage, (pt[i] & _PAGE_RW) ? 1 : 0);
+            }
+        }
+
+        unmap_domain_mem(pt);
+    }
+
+    // Walk the shadow hash table, adjusting counts for each shadow page
+    // and the guest frame it shadows.
+    void adjust_shadow_tables()
+    {
+        struct shadow_status *a;
+        unsigned long smfn, gmfn;
+        struct pfn_info *page;
+        int i;
+
+        for ( i = 0; i < shadow_ht_buckets; i++ )
+        {
+            a = &d->arch.shadow_ht[i];
+            while ( a && a->gpfn_and_flags )
+            {
+                gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
+                smfn = a->smfn;
+                page = &frame_table[smfn];
+
+                adjust(pfn_to_page(gmfn), 0);
+
+                switch ( a->gpfn_and_flags & PGT_type_mask ) {
+                case PGT_snapshot:
+                    break;
+                case PGT_l1_shadow:
+                case PGT_hl2_shadow:
+                    adjust_l1_page(smfn);
+                    if ( page->u.inuse.type_info & PGT_pinned )
+                        adjust(page, 0);
+                    break;
+                case PGT_l2_shadow:
+                    adjust_l2_page(smfn, 0);
+                    if ( page->u.inuse.type_info & PGT_pinned )
+                        adjust(page, 0);
+                    break;
+                default:
+                    BUG();
+                    break;
+                }
+
+                a = a->next;
+            }
+        }
+    }
+
+    // Adjust counts for each out-of-sync page and (when its writable_pl1e
+    // entry is a valid, aligned pointer) the L1 page holding that entry.
+    void adjust_oos_list()
+    {
+        struct out_of_sync_entry *oos;
+
+        if ( (oos = d->arch.out_of_sync) )
+            ASSERT(shadow_enabled);
+
+        while ( oos )
+        {
+            adjust(pfn_to_page(oos->gmfn), 0);
+
+            // Only use entries that have low bits clear...
+            //
+            if ( !(oos->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
+                adjust(pfn_to_page(oos->writable_pl1e >> PAGE_SHIFT), 0);
+
+            oos = oos->next;
+            oos_count++;
+        }
+    }
+
+    // Adjust counts for each vcpu's current page-table base frames.
+    void adjust_for_pgtbase()
+    {
+        struct exec_domain *ed;
+
+        for_each_exec_domain(d, ed)
+            {
+                if ( !shadow_enabled )
+                {
+                    if ( pagetable_val(ed->arch.guest_table) )
+                        adjust(&frame_table[pagetable_val(ed->arch.guest_table)
+                                            >> PAGE_SHIFT], 1);
+                }
+                else
+                {
+                    if ( pagetable_val(ed->arch.guest_table) )
+                        adjust(&frame_table[pagetable_val(ed->arch.guest_table)
+                                            >> PAGE_SHIFT], 0);
+                    if ( pagetable_val(ed->arch.shadow_table) )
+                        adjust(&frame_table[pagetable_val(ed->arch.shadow_table)
+                                            >> PAGE_SHIFT], 0);
+                }
+            }
+    }
+
+    // Walk every page owned by the domain and adjust counts according to
+    // its type (L1/L2 tables recurse into their entries).
+    void adjust_guest_pages()
+    {
+        struct list_head *list_ent = d->page_list.next;
+        struct pfn_info *page;
+        unsigned long mfn;
+
+        while ( list_ent != &d->page_list )
+        {
+            u32 page_type;
+
+            page = list_entry(list_ent, struct pfn_info, list);
+            mfn = page_to_pfn(page);
+            page_type = page->u.inuse.type_info & PGT_type_mask;
+
+            if ( page_get_owner(page) != d )
+                BUG();
+
+            page_count++;
+
+            switch ( page_type )
+            {
+            case PGT_l2_page_table:
+                l2++;
+
+                if ( noisy )
+                {
+                    if ( shadow_enabled )
+                    {
+                        // Format has four specifiers: d->id must be supplied
+                        // for the leading "Audit %d".
+                        printk("Audit %d: found an L2 guest page "
+                               "mfn=%p t=%08x c=%08x while in shadow mode\n",
+                               d->id, mfn, page->u.inuse.type_info,
+                               page->count_info);
+                        errors++;
+                    }
+
+                    if ( (page->u.inuse.type_info & PGT_validated) !=
+                         PGT_validated )
+                    {
+                        printk("Audit %d: L2 mfn=%p not validated %p\n",
+                               d->id, mfn, page->u.inuse.type_info);
+                        errors++;
+                    }
+
+                    if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
+                    {
+                        printk("Audit %d: L2 mfn=%p not pinned t=%p\n",
+                               d->id, mfn, page->u.inuse.type_info);
+                        errors++;
+                    }
+                }
+
+                if ( page->u.inuse.type_info & PGT_pinned )
+                    adjust(page, 1);
+
+                if ( page->u.inuse.type_info & PGT_validated )
+                    adjust_l2_page(mfn, 1);
+
+                break;
+
+            case PGT_l1_page_table:
+                l1++;
+
+                if ( noisy )
+                {
+                    if ( shadow_enabled )
+                    {
+                        printk("found an L1 guest page mfn=%p t=%08x c=%08x while in shadow mode\n",
+                               mfn, page->u.inuse.type_info, page->count_info);
+                        errors++;
+                    }
+
+                    if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
+                    {
+                        printk("Audit %d: L1 not validated mfn=%p t=%p\n",
+                               d->id, mfn, page->u.inuse.type_info);
+                        errors++;
+                    }
+
+                    if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
+                    {
+                        // Unpinned L1s are legal with writable pagetables.
+                        if ( !VM_ASSIST(d, VMASST_TYPE_writable_pagetables) )
+                        {
+                            printk("Audit %d: L1 mfn=%p not pinned t=%p\n",
+                                   d->id, mfn, page->u.inuse.type_info);
+                            errors++;
+                        }
+                    }
+                }
+                
+                if ( page->u.inuse.type_info & PGT_pinned )
+                    adjust(page, 1);
+
+                if ( page->u.inuse.type_info & PGT_validated )
+                    adjust_l1_page(mfn);
+
+                break;
+
+            case PGT_gdt_page:
+                ASSERT( !page_out_of_sync(page) );
+                adjust(page, 1);
+                break;
+
+            case PGT_ldt_page:
+                ASSERT( !page_out_of_sync(page) );
+                adjust(page, 1);
+                break;
+
+            case PGT_writable_page:
+                if ( shadow_enabled )
+                {
+                    // In shadow mode, writable pages can get pinned by
+                    // paravirtualized guests that think they are pinning
+                    // their L1s and/or L2s.
+                    //
+                    if ( page->u.inuse.type_info & PGT_pinned )
+                        adjust(page, 1);
+                }
+            }
+
+            list_ent = page->list.next;
+        }
+    }
+
+    adjust_for_pgtbase();
+
+    adjust_guest_pages();
+
+    if ( shadow_enabled )
+    {
+        adjust_oos_list();
+        adjust_shadow_tables();
+    }
+
+    return errors;
+}
+
+
+#ifndef NDEBUG
+
+void _audit_domain(struct domain *d, int flags, const char *file, int line)
+{
+    void scan_for_pfn_in_mfn(struct domain *d, unsigned long xmfn,
+                             unsigned long mfn)
+    {
+        struct pfn_info *page = &frame_table[mfn];
+        unsigned long *pt = map_domain_mem(mfn);
+        int i;
+
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        {
+            if ( (pt[i] & _PAGE_PRESENT) && ((pt[i] >> PAGE_SHIFT) == xmfn) )
+                printk("     found dom=%d mfn=%p t=%x c=%x pt[i=%x]=%p\n",
+                       d->id, mfn, page->u.inuse.type_info,
+                       page->count_info, i, pt[i]);
+        }
+
+        unmap_domain_mem(pt);           
+    }
+
+    void scan_for_pfn(struct domain *d, unsigned long xmfn)
+    {
+        if ( !shadow_mode_enabled(d) )
+        {
+            struct list_head *list_ent = d->page_list.next;
+            struct pfn_info *page;
+
+            while ( list_ent != &d->page_list )
+            {
+                page = list_entry(list_ent, struct pfn_info, list);
+
+                switch ( page->u.inuse.type_info & PGT_type_mask )
+                {
+                case PGT_l1_page_table:
+                case PGT_l2_page_table:
+                    scan_for_pfn_in_mfn(d, xmfn, page_to_pfn(page));
+                    break;
+                default:
+                    break;
+                }
+
+                list_ent = page->list.next;
+            }
+        }
+        else
+        {
+            struct shadow_status *a;
+            int i;
+            
+            for ( i = 0; i < shadow_ht_buckets; i++ )
+            {
+                a = &d->arch.shadow_ht[i];
+                while ( a && a->gpfn_and_flags )
+                {
+                    switch ( a->gpfn_and_flags & PGT_type_mask )
+                    {
+                    case PGT_l1_shadow:
+                    case PGT_l2_shadow:
+                    case PGT_hl2_shadow:
+                        scan_for_pfn_in_mfn(d, xmfn, a->smfn);
+                        break;
+                    case PGT_snapshot:
+                        break;
+                    default:
+                        BUG();
+                        break;
+                    }
+                    a = a->next;
+                }
+            }
+        }
+    }
+
+    void scan_for_pfn_remote(unsigned long xmfn)
+    {
+        struct domain *e;
+        for_each_domain ( e )
+            scan_for_pfn( e, xmfn );
+    } 
+
+    unsigned long mfn;
+    struct list_head *list_ent;
+    struct pfn_info *page;
+    int errors = 0;
+
+    if ( d != current->domain )
+        domain_pause(d);
+    synchronise_pagetables(~0UL);
+
+    // Maybe we should just be using BIGLOCK?
+    //
+    if ( !(flags & AUDIT_ALREADY_LOCKED) )
+        shadow_lock(d);
+
+    spin_lock(&d->page_alloc_lock);
+
+    /* PHASE 0 */
+
+    list_ent = d->page_list.next;
+    while ( list_ent != &d->page_list )
+    {
+        u32 page_type;
+
+        page = list_entry(list_ent, struct pfn_info, list);
+        mfn = page_to_pfn(page);
+        page_type = page->u.inuse.type_info & PGT_type_mask;
+
+        if ( page_get_owner(page) != d )
+            BUG();
+
+        if ( (page->u.inuse.type_info & PGT_count_mask) >
+             (page->count_info & PGC_count_mask) )
+        {
+            printk("taf(%08x) > caf(%08x) mfn=%p\n",
+                   page->u.inuse.type_info, page->count_info, mfn);
+            errors++;
+        }
+
+        if ( shadow_mode_enabled(d) &&
+             (page_type == PGT_writable_page) &&
+             !(page->u.inuse.type_info & PGT_validated) )
+        {
+            printk("shadow mode writable page not validated mfn=%p t=%08x c=%08x\n",
+                   mfn, page->u.inuse.type_info, page->count_info);
+            errors++;
+        }
+#if 0   /* SYSV shared memory pages plus writeable files. */
+        if ( page_type == PGT_writable_page && 
+             (page->u.inuse.type_info & PGT_count_mask) > 1 )
+        {
+            printk("writeable page with type count >1: mfn=%lx t=%x c=%x\n",
+                  mfn,
+                  page->u.inuse.type_info,
+                  page->count_info );
+            errors++;
+            scan_for_pfn_remote(mfn);
+        }
+#endif
+
+        if ( page_type == PGT_none && 
+             (page->u.inuse.type_info & PGT_count_mask) > 0 )
+        {
+            printk("normal page with type count >0: mfn=%lx t=%x c=%x\n",
+                  mfn,
+                  page->u.inuse.type_info,
+                  page->count_info );
+            errors++;
+        }
+
+        if ( page_out_of_sync(page) )
+        {
+            if ( !page_is_page_table(page) )
+            {
+                printk("out of sync page mfn=%p is not a page table\n", mfn);
+                errors++;
+            }
+            unsigned long pfn = __mfn_to_gpfn(d, mfn);
+            if ( !__shadow_status(d, pfn, PGT_snapshot) )
+            {
+                printk("out of sync page mfn=%p doesn't have a snapshot\n");
+                errors++;
+            }
+            if ( page_type != PGT_writable_page )
+            {
+                printk("out of sync page mfn=%p has strange type t=%08x c=%08x\n",
+                       mfn, page->u.inuse.type_info, page->count_info);
+                errors++;
+            }
+        }
+
+        /* Use tlbflush_timestamp to store original type_info. */
+        page->tlbflush_timestamp = page->u.inuse.type_info;
+
+        list_ent = page->list.next;
+    }
+
+    /* PHASE 1 */
+    io_mappings = lowmem_mappings = 0;
+
+    errors += audit_adjust_pgtables(d, -1, 1);
+
+    if ( !(flags & AUDIT_QUIET) &&
+         ((io_mappings > 0) || (lowmem_mappings > 0)) )
+        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
+               d->id, lowmem_mappings, io_mappings);
+
+    /* PHASE 2 */
+
+    list_ent = d->page_list.next;
+    while ( list_ent != &d->page_list )
+    {
+        page = list_entry(list_ent, struct pfn_info, list);
+        mfn = page_to_pfn(page);
+
+        switch ( page->u.inuse.type_info & PGT_type_mask)
+        {
+        case PGT_l1_page_table:
+        case PGT_l2_page_table:
+            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
+            {
+                printk("Audit %d: type count!=0 t=%x ot=%x c=%x mfn=%lx\n",
+                       d->id, page->u.inuse.type_info, 
+                       page->tlbflush_timestamp,
+                       page->count_info, mfn);
+                errors++;
+                scan_for_pfn_remote(mfn);
+            }
+            break;
+        case PGT_none:
+        case PGT_writable_page:
+        case PGT_gdt_page:
+        case PGT_ldt_page:
+            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
+            {
+                printk("Audit %d: type count!=0 t=%x ot=%x c=%x mfn=%lx\n",
+                       d->id, page->u.inuse.type_info, 
+                       page->tlbflush_timestamp,
+                       page->count_info, mfn);
+                errors++;
+            }
+            break;
+        default:
+            BUG(); // XXX fix me...
+        }
+        
+        if ( (page->count_info & PGC_count_mask) != 1 )
+        {
+            printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x mfn=%lx\n",
+                   d->id,
+                   page->count_info,
+                   page->u.inuse.type_info, 
+                   page->tlbflush_timestamp, mfn );
+            errors++;
+            scan_for_pfn_remote(mfn);
+        }
+
+        list_ent = page->list.next;
+    }
+
+    if ( shadow_mode_enabled(d) )
+    {
+        struct shadow_status *a;
+        struct pfn_info *page;
+        u32 page_type;
+        int i;
+
+        for ( i = 0; i < shadow_ht_buckets; i++ )
+        {
+            a = &d->arch.shadow_ht[i];
+            while ( a && a->gpfn_and_flags )
+            {
+                page = pfn_to_page(a->smfn);
+                page_type = a->gpfn_and_flags & PGT_type_mask;
+
+                switch ( page_type ) {
+                case PGT_snapshot:
+                    // XXX -- what should we check here?
+                    break;
+                case PGT_l1_shadow:
+                case PGT_l2_shadow:
+                    if ( ((page->u.inuse.type_info & PGT_type_mask) != page_type ) ||
+                         (page->count_info != 0) )
+                    {
+                        printk("Audit %d: shadow page counts wrong mfn=%p t=%x c=%x\n",
+                               d->id, page_to_pfn(page),
+                               page->u.inuse.type_info,
+                               page->count_info);
+                        errors++;
+                    }
+                    break;
+
+                case PGT_hl2_shadow: // haven't thought about this case yet.
+                default:
+                    BUG();
+                    break;
+                }
+
+                a = a->next;
+            }
+        }
+    }
+
+    /* PHASE 3 */
+    ctot = ttot = page_count = l1 = l2 = oos_count = 0;
+
+    audit_adjust_pgtables(d, 1, 0);
+
+#if 0
+    // This covers our sins of trashing the tlbflush_timestamps...
+    //
+    local_flush_tlb();
+#endif
+
+    spin_unlock(&d->page_alloc_lock);
+
+    if ( !(flags & AUDIT_QUIET) )
+        printk("Audit dom%d (%s:%d) Done. "
+               "pages=%d oos=%d l1=%d l2=%d ctot=%d ttot=%d\n",
+               d->id, file, line, page_count, oos_count, l1, l2, ctot, ttot );
+
+    if ( !(flags & AUDIT_ALREADY_LOCKED) )
+        shadow_unlock(d);
+
+    if ( d != current->domain )
+        domain_unpause(d);
+
+    if ( errors && !(flags & AUDIT_ERRORS_OK) )
+        BUG();
+}
+
+/* Audit every domain in the system (full, noisy audit of each). */
+void audit_domains(void)
+{
+    struct domain *d;
+    for_each_domain ( d )
+        audit_domain(d);
+}
+
+/* Console keyhandler hook: trigger audit_domains(); 'key' is unused. */
+void audit_domains_key(unsigned char key)
+{
+    audit_domains();
+}
+#endif
index f2b46e8c07f387b8899787c0f0c47c135ba1594b..029d5fd5a4f1318ef5d92c801b71ae52e5f1181a 100644 (file)
@@ -247,10 +247,9 @@ void arch_do_createdomain(struct exec_domain *ed)
         machine_to_phys_mapping[virt_to_phys(d->arch.mm_perdomain_pt) >> 
                                PAGE_SHIFT] = INVALID_M2P_ENTRY;
         ed->arch.perdomain_ptes = d->arch.mm_perdomain_pt;
-#if 0 /* don't need this yet, but maybe soon! */
-        ed->arch.guest_vtable = linear_l2_table;
-        ed->arch.shadow_vtable = shadow_linear_l2_table;
-#endif
+
+        ed->arch.guest_vtable  = __linear_l2_table;
+        ed->arch.shadow_vtable = __shadow_linear_l2_table;
 
 #ifdef __x86_64__
         d->arch.mm_perdomain_l2 = (l2_pgentry_t *)alloc_xenheap_page();
@@ -295,70 +294,6 @@ void arch_vmx_do_launch(struct exec_domain *ed)
     reset_stack_and_jump(vmx_asm_do_launch);
 }
 
-unsigned long alloc_monitor_pagetable(struct exec_domain *ed)
-{
-    unsigned long mmfn;
-    l2_pgentry_t *mpl2e;
-    struct pfn_info *mmfn_info;
-    struct domain *d = ed->domain;
-
-    ASSERT(!pagetable_val(ed->arch.monitor_table)); /* we should only get called once */
-
-    mmfn_info = alloc_domheap_page(NULL);
-    ASSERT( mmfn_info ); 
-
-    mmfn = (unsigned long) (mmfn_info - frame_table);
-    mpl2e = (l2_pgentry_t *) map_domain_mem(mmfn << PAGE_SHIFT);
-    memset(mpl2e, 0, PAGE_SIZE);
-
-    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
-           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
-           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-
-    mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
-        mk_l2_pgentry((__pa(d->arch.mm_perdomain_pt) & PAGE_MASK) 
-                      | __PAGE_HYPERVISOR);
-
-    ed->arch.monitor_vtable = mpl2e;
-
-    // map the phys_to_machine map into the Read-Only MPT space for this domain
-    mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
-        mk_l2_pgentry(pagetable_val(ed->arch.phys_table) | __PAGE_HYPERVISOR);
-
-    return mmfn;
-}
-
-/*
- * Free the pages for monitor_table and hl2_table
- */
-static void free_monitor_pagetable(struct exec_domain *ed)
-{
-    l2_pgentry_t *mpl2e;
-    unsigned long mfn;
-
-    ASSERT( pagetable_val(ed->arch.monitor_table) );
-    
-    mpl2e = ed->arch.monitor_vtable;
-
-    /*
-     * First get the mfn for hl2_table by looking at monitor_table
-     */
-    mfn = l2_pgentry_val(mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])
-        >> PAGE_SHIFT;
-
-    free_domheap_page(&frame_table[mfn]);
-    unmap_domain_mem(mpl2e);
-
-    /*
-     * Then free monitor_table.
-     */
-    mfn = (pagetable_val(ed->arch.monitor_table)) >> PAGE_SHIFT;
-    free_domheap_page(&frame_table[mfn]);
-
-    ed->arch.monitor_table = mk_pagetable(0);
-    ed->arch.monitor_vtable = 0;
-}
-
 static int vmx_final_setup_guest(struct exec_domain *ed,
                                    full_execution_context_t *full_context)
 {
index 75130fe1c607bc054b9a916bcefa92ec37a26c4c..ae2b31d495d6e2c59e50d54c7348fdb666d7b9e6 100644 (file)
@@ -25,6 +25,9 @@
 static unsigned int opt_dom0_mem = 0;
 integer_param("dom0_mem", opt_dom0_mem);
 
+static unsigned int opt_dom0_shadow = 0;
+boolean_param("dom0_shadow", opt_dom0_shadow);
+
 #if defined(__i386__)
 /* No ring-3 access in initial leaf page tables. */
 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
@@ -267,8 +270,13 @@ int construct_dom0(struct domain *d,
     l1tab += l1_table_offset(vpt_start);
     for ( count = 0; count < nr_pt_pages; count++ ) 
     {
-        *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
         page = &frame_table[l1_pgentry_to_pfn(*l1tab)];
+        if ( !opt_dom0_shadow )
+            *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
+        else
+            if ( !get_page_type(page, PGT_writable_page) )
+                BUG();
+
         if ( count == 0 )
         {
             page->u.inuse.type_info &= ~PGT_type_mask;
@@ -512,6 +520,12 @@ int construct_dom0(struct domain *d,
 
     new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);
 
+    if ( opt_dom0_shadow )
+    {
+        shadow_mode_enable(d, SHM_enable); 
+        update_pagetables(ed); /* XXX SMP */
+    }
+
     return 0;
 }
 
index 72ba2e279738df52113493661dc1e14fd92a0ed9..c70b5bbe1cb604e64d681d621f71347e39dfc2f3 100644 (file)
 
 #ifdef VERBOSE
 #define MEM_LOG(_f, _a...)                           \
-  printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
+  printk("DOM%u: MEM_LOG(line=%d) " _f "\n", \
          current->domain->id , __LINE__ , ## _a )
 #else
 #define MEM_LOG(_f, _a...) ((void)0)
 #endif
 
-static int alloc_l2_table(struct pfn_info *page);
-static int alloc_l1_table(struct pfn_info *page);
-static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
-static int get_page_and_type_from_pagenr(unsigned long page_nr, 
-                                         u32 type,
-                                         struct domain *d);
-
 static void free_l2_table(struct pfn_info *page);
 static void free_l1_table(struct pfn_info *page);
 
@@ -222,7 +215,7 @@ static void __invalidate_shadow_ldt(struct exec_domain *d)
 }
 
 
-static inline void invalidate_shadow_ldt(struct exec_domain *d)
+void invalidate_shadow_ldt(struct exec_domain *d)
 {
     if ( d->arch.shadow_ldt_mapcnt != 0 )
         __invalidate_shadow_ldt(d);
@@ -254,21 +247,41 @@ int map_ldt_shadow_page(unsigned int off)
 {
     struct exec_domain *ed = current;
     struct domain *d = ed->domain;
-    unsigned long l1e;
+    unsigned long l1e, nl1e, gpfn, gmfn;
+    unsigned gva = ed->arch.ldt_base + (off << PAGE_SHIFT);
+    int res;
 
     if ( unlikely(in_irq()) )
         BUG();
 
-    __get_user(l1e, (unsigned long *)
-               &linear_pg_table[l1_linear_offset(ed->arch.ldt_base) + off]);
+    shadow_sync_va(ed, gva);
+    __get_user(l1e, (unsigned long *)&linear_pg_table[l1_linear_offset(gva)]);
+
+    if ( unlikely(!(l1e & _PAGE_PRESENT)) )
+        return 0;
+
+    gpfn = l1_pgentry_to_pfn(mk_l1_pgentry(l1e));
+    gmfn = __gpfn_to_mfn(d, gpfn);
+    if ( unlikely(!gmfn) )
+        return 0;
+
+    if ( unlikely(shadow_mode_enabled(d)) )
+    {
+        shadow_lock(d);
+        shadow_remove_all_write_access(d, PGT_l1_shadow, PGT_l1_shadow, gpfn);
+    }
+
+    res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page);
+
+    if ( unlikely(shadow_mode_enabled(d)) )
+        shadow_unlock(d);
 
-    if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
-         unlikely(!get_page_and_type(
-             &frame_table[l1_pgentry_to_pfn(mk_l1_pgentry(l1e))],
-             d, PGT_ldt_page)) )
+    if ( unlikely(!res) )
         return 0;
 
-    ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
+    nl1e = (l1e & ~PAGE_MASK) | (gmfn << PAGE_SHIFT) | _PAGE_RW;
+
+    ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(nl1e);
     ed->arch.shadow_ldt_mapcnt++;
 
     return 1;
@@ -337,6 +350,8 @@ get_linear_pagetable(
     struct pfn_info *page;
     unsigned long pfn;
 
+    ASSERT( !shadow_mode_enabled(d) );
+
     if ( (root_pgentry_val(re) & _PAGE_RW) )
     {
         MEM_LOG("Attempt to create linear p.t. with write perms");
@@ -372,13 +387,13 @@ get_linear_pagetable(
 }
 
 
-static int
+int
 get_page_from_l1e(
     l1_pgentry_t l1e, struct domain *d)
 {
     unsigned long l1v = l1_pgentry_val(l1e);
-    unsigned long pfn = l1_pgentry_to_pfn(l1e);
-    struct pfn_info *page = &frame_table[pfn];
+    unsigned long mfn = l1_pgentry_to_pfn(l1e);
+    struct pfn_info *page = &frame_table[mfn];
     extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
 
     if ( !(l1v & _PAGE_PRESENT) )
@@ -386,11 +401,11 @@ get_page_from_l1e(
 
     if ( unlikely(l1v & L1_DISALLOW_MASK) )
     {
-        MEM_LOG("Bad L1 type settings %p", l1v & L1_DISALLOW_MASK);
+        MEM_LOG("Bad L1 type settings %p %p", l1v, l1v & L1_DISALLOW_MASK);
         return 0;
     }
 
-    if ( unlikely(!pfn_is_ram(pfn)) )
+    if ( unlikely(!pfn_is_ram(mfn)) )
     {
         /* Revert to caller privileges if FD == DOMID_IO. */
         if ( d == dom_io )
@@ -400,9 +415,9 @@ get_page_from_l1e(
             return 1;
 
         if ( IS_CAPABLE_PHYSDEV(d) )
-            return domain_iomem_in_pfn(d, pfn);
+            return domain_iomem_in_pfn(d, mfn);
 
-        MEM_LOG("Non-privileged attempt to map I/O space %p", pfn);
+        MEM_LOG("Non-privileged attempt to map I/O space %p", mfn);
         return 0;
     }
 
@@ -420,6 +435,8 @@ get_page_from_l2e(
 {
     int rc;
 
+    ASSERT( !shadow_mode_enabled(d) );
+
     if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
         return 1;
 
@@ -491,7 +508,7 @@ get_page_from_l4e(
 #endif /* __x86_64__ */
 
 
-static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
+void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
 {
     unsigned long    l1v  = l1_pgentry_val(l1e);
     unsigned long    pfn  = l1_pgentry_to_pfn(l1e);
@@ -530,6 +547,8 @@ static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
         if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == 
                        PGT_ldt_page)) &&
              unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
+
+            // XXX SMP BUG?
             invalidate_shadow_ldt(e->exec_domain[0]);
         put_page(page);
     }
@@ -575,6 +594,8 @@ static int alloc_l1_table(struct pfn_info *page)
     l1_pgentry_t  *pl1e;
     int            i;
 
+    ASSERT( !shadow_mode_enabled(d) );
+
     pl1e = map_domain_mem(pfn << PAGE_SHIFT);
 
     for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
@@ -601,6 +622,11 @@ static int alloc_l2_table(struct pfn_info *page)
     unsigned long  pfn = page_to_pfn(page);
     l2_pgentry_t  *pl2e;
     int            i;
+
+    if ( (PGT_base_page_table == PGT_l2_page_table) &&
+         shadow_mode_enabled(d) )
+        return 1;
+    ASSERT( !shadow_mode_enabled(d) );
    
     pl2e = map_domain_mem(pfn << PAGE_SHIFT);
 
@@ -643,6 +669,8 @@ static int alloc_l3_table(struct pfn_info *page)
     l3_pgentry_t  *pl3e = page_to_virt(page);
     int            i;
 
+    ASSERT( !shadow_mode_enabled(d) );
+
     for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
         if ( is_guest_l3_slot(i) &&
              unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
@@ -666,6 +694,11 @@ static int alloc_l4_table(struct pfn_info *page)
     l4_pgentry_t  *pl4e = page_to_virt(page);
     int            i;
 
+    if ( (PGT_base_page_table == PGT_l4_page_table) &&
+         shadow_mode_enabled(d) )
+        return 1;
+    ASSERT( !shadow_mode_enabled(d) );
+
     for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
         if ( is_guest_l4_slot(i) &&
              unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
@@ -765,7 +798,7 @@ static inline int update_l1e(l1_pgentry_t *pl1e,
     if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
          unlikely(o != l1_pgentry_val(ol1e)) )
     {
-        MEM_LOG("Failed to update %p -> %p: saw %p\n",
+        MEM_LOG("Failed to update %p -> %p: saw %p",
                 l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
         return 0;
     }
@@ -781,6 +814,8 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
     unsigned long _ol1e;
     struct domain *d = current->domain;
 
+    ASSERT( !shadow_mode_enabled(d) );
+
     if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
         return 0;
     ol1e = mk_l1_pgentry(_ol1e);
@@ -807,13 +842,12 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
             put_page_from_l1e(nl1e, d);
             return 0;
         }
-        
-        put_page_from_l1e(ol1e, d);
-        return 1;
     }
-
-    if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
-        return 0;
+    else
+    {
+        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+            return 0;
+    }
     
     put_page_from_l1e(ol1e, d);
     return 1;
@@ -825,7 +859,7 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
                                 _t ## _pgentry_val(_o),                 \
                                 _t ## _pgentry_val(_n));                \
     if ( __o != _t ## _pgentry_val(_o) )                                \
-        MEM_LOG("Failed to update %p -> %p: saw %p\n",                  \
+        MEM_LOG("Failed to update %p -> %p: saw %p",                    \
                 _t ## _pgentry_val(_o), _t ## _pgentry_val(_n), __o);   \
     (__o == _t ## _pgentry_val(_o)); })
 
@@ -872,13 +906,12 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
             put_page_from_l2e(nl2e, pfn);
             return 0;
         }
-        
-        put_page_from_l2e(ol2e, pfn);
-        return 1;
     }
-
-    if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
-        return 0;
+    else
+    {
+        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
+            return 0;
+    }
 
     put_page_from_l2e(ol2e, pfn);
     return 1;
@@ -1025,7 +1058,9 @@ int alloc_page_type(struct pfn_info *page, unsigned int type)
 
 void free_page_type(struct pfn_info *page, unsigned int type)
 {
-    struct domain *d = page_get_owner(page);
+    struct domain *owner = page_get_owner(page);
+    if ( likely(owner != NULL) && unlikely(shadow_mode_enabled(owner)) )
+        return;
 
     switch ( type )
     {
@@ -1050,13 +1085,6 @@ void free_page_type(struct pfn_info *page, unsigned int type)
     default:
         BUG();
     }
-
-    if ( unlikely(shadow_mode_enabled(d)) && 
-         (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) )
-    {
-        unshadow_table(page_to_pfn(page), type);
-        put_shadow_status(d);
-    }
 }
 
 
@@ -1096,15 +1124,16 @@ void put_page_type(struct pfn_info *page)
                 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
                                            x & ~PGT_validated)) != x) )
                     goto again;
-                /* We cleared the 'valid bit' so we do the clear up. */
+                /* We cleared the 'valid bit' so we do the clean up. */
                 free_page_type(page, x & PGT_type_mask);
                 /* Carry on, but with the 'valid bit' now clear. */
                 x  &= ~PGT_validated;
                 nx &= ~PGT_validated;
             }
         }
-        else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 
-                           (PGT_pinned | 1)) )
+        else if ( unlikely(((nx & (PGT_pinned | PGT_count_mask)) == 
+                            (PGT_pinned | 1)) &&
+                           ((nx & PGT_type_mask) != PGT_writable_page)) )
         {
             /* Page is now only pinned. Make the back pointer mutable again. */
             nx |= PGT_va_mutable;
@@ -1124,7 +1153,7 @@ int get_page_type(struct pfn_info *page, u32 type)
         nx = x + 1;
         if ( unlikely((nx & PGT_count_mask) == 0) )
         {
-            MEM_LOG("Type count overflow on pfn %p\n", page_to_pfn(page));
+            MEM_LOG("Type count overflow on pfn %p", page_to_pfn(page));
             return 0;
         }
         else if ( unlikely((x & PGT_count_mask) == 0) )
@@ -1137,6 +1166,8 @@ int get_page_type(struct pfn_info *page, u32 type)
                  * circumstances should be very rare.
                  */
                 struct domain *d = page_get_owner(page);
+
+                // XXX SMP bug?
                 if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->
                                                       processor],
                                          page->tlbflush_timestamp)) )
@@ -1155,14 +1186,24 @@ int get_page_type(struct pfn_info *page, u32 type)
                     nx |= PGT_validated;
             }
         }
+        else if ( unlikely(!(x & PGT_validated)) )
+        {
+            /* Someone else is updating validation of this page. Wait... */
+            while ( (y = page->u.inuse.type_info) == x )
+            {
+                rep_nop();
+                barrier();
+            }
+            goto again;
+        }
         else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
         {
             if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
             {
                 if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
                      ((type & PGT_type_mask) != PGT_l1_page_table) )
-                    MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %p\n",
-                            x & PGT_type_mask, type, page_to_pfn(page));
+                    MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %p",
+                            x, type, page_to_pfn(page));
                 return 0;
             }
             else if ( (x & PGT_va_mask) == PGT_va_mutable )
@@ -1178,16 +1219,6 @@ int get_page_type(struct pfn_info *page, u32 type)
                 nx |= PGT_va_unknown;
             }
         }
-        else if ( unlikely(!(x & PGT_validated)) )
-        {
-            /* Someone else is updating validation of this page. Wait... */
-            while ( (y = page->u.inuse.type_info) == x )
-            {
-                rep_nop();
-                barrier();
-            }
-            goto again;
-        }
     }
     while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
 
@@ -1197,7 +1228,7 @@ int get_page_type(struct pfn_info *page, u32 type)
         if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
         {
             MEM_LOG("Error while validating pfn %p for type %08x."
-                    " caf=%08x taf=%08x\n",
+                    " caf=%08x taf=%08x",
                     page_to_pfn(page), type,
                     page->count_info,
                     page->u.inuse.type_info);
@@ -1214,30 +1245,36 @@ int get_page_type(struct pfn_info *page, u32 type)
 }
 
 
-int new_guest_cr3(unsigned long pfn)
+int new_guest_cr3(unsigned long mfn)
 {
     struct exec_domain *ed = current;
     struct domain *d = ed->domain;
-    int okay, cpu = smp_processor_id();
-    unsigned long old_base_pfn;
-    
-    okay = get_page_and_type_from_pagenr(pfn, PGT_root_page_table, d);
+    int okay;
+    unsigned long old_base_mfn;
+
+    if ( shadow_mode_enabled(d) )
+        okay = get_page_from_pagenr(mfn, d);
+    else
+        okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
+
     if ( likely(okay) )
     {
         invalidate_shadow_ldt(ed);
 
-        percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
-        old_base_pfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
-        ed->arch.guest_table = mk_pagetable(pfn << PAGE_SHIFT);
+        old_base_mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
+        ed->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
         update_pagetables(ed); /* update shadow_table and monitor_table */
 
         write_ptbase(ed);
 
-        put_page_and_type(&frame_table[old_base_pfn]);
+        if ( shadow_mode_enabled(d) )
+            put_page(&frame_table[old_base_mfn]);
+        else
+            put_page_and_type(&frame_table[old_base_mfn]);
     }
     else
     {
-        MEM_LOG("Error while installing new baseptr %p", pfn);
+        MEM_LOG("Error while installing new baseptr %p", mfn);
     }
 
     return okay;
@@ -1247,10 +1284,11 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
 {
     int okay = 1, cpu = smp_processor_id();
     unsigned int cmd = val & MMUEXT_CMD_MASK, type;
-    unsigned long pfn = ptr >> PAGE_SHIFT;
-    struct pfn_info *page = &frame_table[pfn];
     struct exec_domain *ed = current;
     struct domain *d = ed->domain, *e;
+    unsigned long gpfn = ptr >> PAGE_SHIFT;
+    unsigned long mfn = __gpfn_to_mfn(d, gpfn);
+    struct pfn_info *page = &frame_table[mfn];
     u32 x, y, _d, _nd;
     domid_t domid;
     grant_ref_t gntref;
@@ -1266,17 +1304,29 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
         type = PGT_l1_page_table | PGT_va_mutable;
 
     pin_page:
-        okay = get_page_and_type_from_pagenr(pfn, type, FOREIGNDOM);
+        if ( unlikely(percpu_info[cpu].foreign &&
+                      (shadow_mode_translate(d) ||
+                       shadow_mode_translate(percpu_info[cpu].foreign))) )
+        {
+            // oops -- we should be using the foreign domain's P2M
+            mfn = __gpfn_to_mfn(FOREIGNDOM, gpfn);
+            page = &frame_table[mfn];
+        }
+
+        if ( shadow_mode_enabled(FOREIGNDOM) )
+            type = PGT_writable_page;
+
+        okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
         if ( unlikely(!okay) )
         {
-            MEM_LOG("Error while pinning pfn %p", pfn);
+            MEM_LOG("Error while pinning mfn %p", mfn);
             break;
         }
 
         if ( unlikely(test_and_set_bit(_PGT_pinned,
                                        &page->u.inuse.type_info)) )
         {
-            MEM_LOG("Pfn %p already pinned", pfn);
+            MEM_LOG("mfn %p already pinned", mfn);
             put_page_and_type(page);
             okay = 0;
             break;
@@ -1299,10 +1349,19 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
 #endif /* __x86_64__ */
 
     case MMUEXT_UNPIN_TABLE:
-        if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
+        if ( unlikely(percpu_info[cpu].foreign &&
+                      (shadow_mode_translate(d) ||
+                       shadow_mode_translate(percpu_info[cpu].foreign))) )
+        {
+            // oops -- we should be using the foreign domain's P2M
+            mfn = __gpfn_to_mfn(FOREIGNDOM, gpfn);
+            page = &frame_table[mfn];
+        }
+
+        if ( unlikely(!(okay = get_page_from_pagenr(mfn, FOREIGNDOM))) )
         {
-            MEM_LOG("Page %p bad domain (dom=%p)",
-                    ptr, page_get_owner(page));
+            MEM_LOG("mfn %p bad domain (dom=%p)",
+                    mfn, page_get_owner(page));
         }
         else if ( likely(test_and_clear_bit(_PGT_pinned, 
                                             &page->u.inuse.type_info)) )
@@ -1314,28 +1373,29 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
         {
             okay = 0;
             put_page(page);
-            MEM_LOG("Pfn %p not pinned", pfn);
+            MEM_LOG("mfn %p not pinned", mfn);
         }
         break;
 
     case MMUEXT_NEW_BASEPTR:
-        okay = new_guest_cr3(pfn);
+        okay = new_guest_cr3(mfn);
+        percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
         break;
         
 #ifdef __x86_64__
     case MMUEXT_NEW_USER_BASEPTR:
-        okay = get_page_and_type_from_pagenr(pfn, PGT_root_page_table, d);
+        okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
         if ( unlikely(!okay) )
         {
-            MEM_LOG("Error while installing new baseptr %p", pfn);
+            MEM_LOG("Error while installing new baseptr %p", mfn);
         }
         else
         {
-            unsigned long old_pfn =
+            unsigned long old_mfn =
                 pagetable_val(ed->arch.guest_table_user) >> PAGE_SHIFT;
-            ed->arch.guest_table_user = mk_pagetable(pfn << PAGE_SHIFT);
-            if ( old_pfn != 0 )
-                put_page_and_type(&frame_table[old_pfn]);
+            ed->arch.guest_table_user = mk_pagetable(mfn << PAGE_SHIFT);
+            if ( old_mfn != 0 )
+                put_page_and_type(&frame_table[old_mfn]);
         }
         break;
 #endif
@@ -1346,12 +1406,14 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
     
     case MMUEXT_INVLPG:
         __flush_tlb_one(ptr);
+        if ( shadow_mode_enabled(d) )
+            shadow_invlpg(ed, ptr);
         break;
 
     case MMUEXT_FLUSH_CACHE:
         if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
         {
-            MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
+            MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.");
             okay = 0;
         }
         else
@@ -1362,6 +1424,8 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
 
     case MMUEXT_SET_LDT:
     {
         unsigned long ents = val >> MMUEXT_CMD_SHIFT;
+
+        ASSERT( !shadow_mode_external(d) );
         if ( ((ptr & (PAGE_SIZE-1)) != 0) || 
              (ents > 8192) ||
@@ -1375,6 +1439,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
                   (ed->arch.ldt_base != ptr) )
         {
             invalidate_shadow_ldt(ed);
+            shadow_sync_all(d);
             ed->arch.ldt_base = ptr;
             ed->arch.ldt_ents = ents;
             load_LDT(ed);
@@ -1401,7 +1466,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
                 percpu_info[cpu].foreign = dom_io;
                 break;
             default:
-                MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
+                MEM_LOG("Dom %u cannot set foreign dom", d->id);
                 okay = 0;
                 break;
             }
@@ -1435,10 +1500,10 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
         gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
         
         if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
-             unlikely(!pfn_is_ram(pfn)) ||
+             unlikely(!pfn_is_ram(mfn)) ||
              unlikely((e = find_domain_by_id(domid)) == NULL) )
         {
-            MEM_LOG("Bad frame (%p) or bad domid (%d).\n", pfn, domid);
+            MEM_LOG("Bad frame (%p) or bad domid (%d).", mfn, domid);
             okay = 0;
             break;
         }
@@ -1460,7 +1525,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
                  unlikely(_nd != _d) )
             {
                 MEM_LOG("Bad page values %p: ed=%p(%u), sd=%p,"
-                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
+                        " caf=%08x, taf=%08x", page_to_pfn(page),
                         d, d->id, unpickle_domptr(_nd), x, 
                         page->u.inuse.type_info);
                 spin_unlock(&d->page_alloc_lock);
@@ -1496,7 +1561,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
              unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
         {
             MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
-                    "provided a bad grant ref, or is dying (%p).\n",
+                    "provided a bad grant ref, or is dying (%p).",
                     e->tot_pages, e->max_pages, e->d_flags);
             spin_unlock(&e->page_alloc_lock);
             put_domain(e);
@@ -1513,7 +1578,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
         spin_unlock(&e->page_alloc_lock);
 
         /* Transfer is all done: tell the guest about its new page frame. */
-        gnttab_notify_transfer(e, gntref, pfn);
+        gnttab_notify_transfer(e, gntref, mfn);
         
         put_domain(e);
         break;
@@ -1529,7 +1594,14 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
         e = percpu_info[cpu].foreign;
         if ( unlikely(e == NULL) )
         {
-            MEM_LOG("No FOREIGNDOM to reassign pfn %p to", pfn);
+            MEM_LOG("No FOREIGNDOM to reassign mfn %p to", mfn);
+            okay = 0;
+            break;
+        }
+
+        if ( unlikely(!pfn_is_ram(mfn)) )
+        {
+            MEM_LOG("Can't reassign non-ram mfn %p", mfn);
             okay = 0;
             break;
         }
@@ -1574,7 +1646,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
                  unlikely(_nd != _d) )
             {
                 MEM_LOG("Bad page values %p: ed=%p(%u), sd=%p,"
-                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
+                        " caf=%08x, taf=%08x", page_to_pfn(page),
                         d, d->id, unpickle_domptr(_nd), x,
                         page->u.inuse.type_info);
                 okay = 0;
@@ -1637,12 +1709,10 @@ int do_mmu_update(
 #define MMU_UPDATE_PREEMPT_FDOM_MASK  (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
 
     mmu_update_t req;
-    unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
+    unsigned long va = 0, deferred_ops, gpfn, mfn, prev_mfn = 0;
     struct pfn_info *page;
     int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
     unsigned int cmd, done = 0;
-    unsigned long prev_smfn = 0;
-    l1_pgentry_t *prev_spl1e = 0;
     struct exec_domain *ed = current;
     struct domain *d = ed->domain;
     u32 type_info;
@@ -1653,10 +1723,9 @@ int do_mmu_update(
     cleanup_writable_pagetable(d);
 
     if ( unlikely(shadow_mode_enabled(d)) )
-        check_pagetable(d, ed->arch.guest_table, "pre-mmu"); /* debug */
-
-    if ( unlikely(shadow_mode_translate(d) ) )
-        domain_crash();
+    {
+        check_pagetable(ed, "pre-mmu"); /* debug */
+    }
 
     /*
      * If we are resuming after preemption, read how much work we have already
@@ -1715,7 +1784,8 @@ int do_mmu_update(
         }
 
         cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
-        pfn = req.ptr >> PAGE_SHIFT;
+        gpfn = req.ptr >> PAGE_SHIFT;
+        mfn = __gpfn_to_mfn(d, gpfn);
 
         okay = 0;
 
@@ -1725,107 +1795,91 @@ int do_mmu_update(
              * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
              */
         case MMU_NORMAL_PT_UPDATE:
-            if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
+            if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) )
             {
                 MEM_LOG("Could not get page for normal update");
                 break;
             }
 
-            if ( likely(prev_pfn == pfn) )
+            if ( likely(prev_mfn == mfn) )
             {
                 va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
             }
             else
             {
-                if ( prev_pfn != 0 )
+                if ( prev_mfn != 0 )
                     unmap_domain_mem((void *)va);
                 va = (unsigned long)map_domain_mem(req.ptr);
-                prev_pfn = pfn;
+                prev_mfn = mfn;
             }
 
-            page = &frame_table[pfn];
+            page = &frame_table[mfn];
             switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
             {
             case PGT_l1_page_table: 
+                ASSERT(!shadow_mode_enabled(d));
                 if ( likely(get_page_type(
                     page, type_info & (PGT_type_mask|PGT_va_mask))) )
                 {
                     okay = mod_l1_entry((l1_pgentry_t *)va, 
-                                        mk_l1_pgentry(req.val)); 
-
-                    if ( unlikely(shadow_mode_enabled(d)) && okay &&
-                         (get_shadow_status(d, page-frame_table) &
-                          PSH_shadowed) )
-                    {
-                        shadow_l1_normal_pt_update(
-                            req.ptr, req.val, &prev_smfn, &prev_spl1e);
-                        put_shadow_status(d);
-                    }
-
+                                        mk_l1_pgentry(req.val));
                     put_page_type(page);
                 }
                 break;
             case PGT_l2_page_table:
+                ASSERT(!shadow_mode_enabled(d));
                 if ( likely(get_page_type(page, PGT_l2_page_table)) )
                 {
                     okay = mod_l2_entry((l2_pgentry_t *)va, 
                                         mk_l2_pgentry(req.val),
-                                        pfn); 
-
-                    if ( unlikely(shadow_mode_enabled(d)) && okay &&
-                         (get_shadow_status(d, page-frame_table) & 
-                          PSH_shadowed) )
-                    {
-                        shadow_l2_normal_pt_update(req.ptr, req.val);
-                        put_shadow_status(d);
-                    }
-
+                                        mfn);
                     put_page_type(page);
                 }
                 break;
 #ifdef __x86_64__
             case PGT_l3_page_table:
+                ASSERT(!shadow_mode_enabled(d));
                 if ( likely(get_page_type(page, PGT_l3_page_table)) )
                 {
                     okay = mod_l3_entry((l3_pgentry_t *)va, 
                                         mk_l3_pgentry(req.val),
-                                        pfn); 
-
-                    if ( unlikely(shadow_mode_enabled(d)) && okay &&
-                         (get_shadow_status(d, page-frame_table) & 
-                          PSH_shadowed) )
-                    {
-                        /*XXXshadow_l3_normal_pt_update(req.ptr, req.val);*/
-                        put_shadow_status(d);
-                    }
-
+                                        mfn);
                     put_page_type(page);
                 }
                 break;
             case PGT_l4_page_table:
+                ASSERT(!shadow_mode_enabled(d));
                 if ( likely(get_page_type(page, PGT_l4_page_table)) )
                 {
                     okay = mod_l4_entry((l4_pgentry_t *)va, 
                                         mk_l4_pgentry(req.val),
-                                        pfn); 
-
-                    if ( unlikely(shadow_mode_enabled(d)) && okay &&
-                         (get_shadow_status(d, page-frame_table) & 
-                          PSH_shadowed) )
-                    {
-                        /*XXXshadow_l4_normal_pt_update(req.ptr, req.val);*/
-                        put_shadow_status(d);
-                    }
-
+                                        mfn);
                     put_page_type(page);
                 }
                 break;
 #endif /* __x86_64__ */
             default:
+                printk("do_mmu_update writable update: ma=%p val=%p\n",
+                       req.ptr, req.val);
                 if ( likely(get_page_type(page, PGT_writable_page)) )
                 {
+                    if ( shadow_mode_enabled(d) )
+                    {
+                        shadow_lock(d);
+
+                        if ( shadow_mode_log_dirty(d) )
+                            __mark_dirty(d, mfn);
+
+                        if ( page_is_page_table(page) )
+                            shadow_mark_mfn_out_of_sync(ed, gpfn, mfn);
+                    }
+
                     *(unsigned long *)va = req.val;
                     okay = 1;
+
+                    if ( shadow_mode_enabled(d) )
+                        shadow_unlock(d);
+
                     put_page_type(page);
                 }
                 break;
@@ -1835,24 +1889,30 @@ int do_mmu_update(
             break;
 
         case MMU_MACHPHYS_UPDATE:
-            if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
+            if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) )
             {
                 MEM_LOG("Could not get page for mach->phys update");
                 break;
             }
 
-            machine_to_phys_mapping[pfn] = req.val;
+            if ( unlikely(shadow_mode_translate(FOREIGNDOM) && !IS_PRIV(d)) )
+            {
+                MEM_LOG("can't mutate the m2p of translated guests");
+                break;
+            }
+
+            set_machinetophys(mfn, req.val);
             okay = 1;
 
             /*
-             * If in log-dirty mode, mark the corresponding pseudo-physical
+             * If in log-dirty mode, mark the corresponding
              * page as dirty.
              */
-            if ( unlikely(shadow_mode_log_dirty(d)) && 
-                 mark_dirty(d, pfn) )
-                d->arch.shadow_dirty_block_count++;
+            if ( unlikely(shadow_mode_log_dirty(FOREIGNDOM)) &&
+                 mark_dirty(FOREIGNDOM, mfn) )
+                FOREIGNDOM->arch.shadow_dirty_block_count++;
 
-            put_page(&frame_table[pfn]);
+            put_page(&frame_table[mfn]);
             break;
 
             /*
@@ -1879,17 +1939,18 @@ int do_mmu_update(
     }
 
  out:
-    if ( prev_pfn != 0 )
+    if ( prev_mfn != 0 )
         unmap_domain_mem((void *)va);
 
-    if ( unlikely(prev_spl1e != 0) ) 
-        unmap_domain_mem((void *)prev_spl1e);
-
     deferred_ops = percpu_info[cpu].deferred_ops;
     percpu_info[cpu].deferred_ops = 0;
 
     if ( deferred_ops & DOP_FLUSH_TLB )
+    {
         local_flush_tlb();
+        if ( shadow_mode_enabled(d) )
+            shadow_sync_all(d);
+    }
         
     if ( deferred_ops & DOP_RELOAD_LDT )
         (void)map_ldt_shadow_page(0);
@@ -1905,7 +1966,7 @@ int do_mmu_update(
         __put_user(done + i, pdone);
 
     if ( unlikely(shadow_mode_enabled(d)) )
-        check_pagetable(d, ed->arch.guest_table, "post-mmu"); /* debug */
+        check_pagetable(ed, "post-mmu"); /* debug */
 
     UNLOCK_BIGLOCK(d);
     return rc;
@@ -1924,12 +1985,9 @@ int do_update_va_mapping(unsigned long va,
 
     perfc_incrc(calls_to_update_va);
 
-    if ( unlikely(!__addr_ok(va)) )
+    if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
         return -EINVAL;
 
-    if ( unlikely(shadow_mode_translate(d) ) )
-        domain_crash();
-
     LOCK_BIGLOCK(d);
 
     cleanup_writable_pagetable(d);
@@ -1938,55 +1996,56 @@ int do_update_va_mapping(unsigned long va,
      * XXX When we make this support 4MB superpages we should also deal with 
      * the case of updating L2 entries.
      */
-
-    if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
-                                mk_l1_pgentry(val))) )
-        err = -EINVAL;
-
-    if ( unlikely(shadow_mode_enabled(d)) )
+    if ( likely(!shadow_mode_enabled(d)) )
     {
-        unsigned long sval = 0;
+        if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
+                                    mk_l1_pgentry(val))) )
+            err = -EINVAL;
+    }
+    else
+    {
+        if ( unlikely(percpu_info[cpu].foreign &&
+                      (shadow_mode_translate(d) ||
+                       shadow_mode_translate(percpu_info[cpu].foreign))) )
+        {
+            // The foreign domain's pfn's are in a different namespace.
+            // We wouldn't be able to figure out how to (re-)shadow our
+            // gpte without additional context.
+            //
+            domain_crash();
+        }
+    
+        check_pagetable(ed, "pre-va"); /* debug */
+        shadow_lock(d);
+        
+        // This is actually overkill - we don't need to sync the L1 itself,
+        // just everything involved in getting to this L1 (i.e. we need
+        // linear_pg_table[l1_linear_offset(va)] to be in sync)...
+        //
+        __shadow_sync_va(ed, va);
+
+        if ( unlikely(__put_user(val, &l1_pgentry_val(
+                                     linear_pg_table[l1_linear_offset(va)]))) )
+            err = -EINVAL;
+        else
+        {
+            // also need to update the shadow
+            unsigned long spte;
 
-        l1pte_propagate_from_guest(d, &val, &sval);
+            l1pte_propagate_from_guest(d, val, &spte);
+            shadow_set_l1e(va, spte, 0);
 
-        if ( unlikely(__put_user(sval, ((unsigned long *)(
-            &shadow_linear_pg_table[l1_linear_offset(va)])))) )
-        {
             /*
-             * Since L2's are guranteed RW, failure indicates either that the
-             * page was not shadowed, or that the L2 entry has not yet been
-             * updated to reflect the shadow.
+             * If we're in log-dirty mode then we need to note that we've updated
+             * the PTE in the PT-holding page. We need the machine frame number
+             * for this.
              */
-            if ( shadow_mode_external(current->domain) )
-                BUG(); // can't use linear_l2_table with external tables.
-
-            l2_pgentry_t gpde = linear_l2_table[l2_table_offset(va)];
-            unsigned long gpfn = l2_pgentry_val(gpde) >> PAGE_SHIFT;
+            if ( shadow_mode_log_dirty(d) )
+                mark_dirty(d, va_to_l1mfn(ed, va));
 
-            if (get_shadow_status(d, gpfn))
-            {
-                unsigned long gmfn = __gpfn_to_mfn(d, gpfn);
-                unsigned long *gl1e = map_domain_mem(gmfn << PAGE_SHIFT);
-                unsigned l1_idx = l1_table_offset(va);
-                gl1e[l1_idx] = sval;
-                unmap_domain_mem(gl1e);
-                put_shadow_status(d);
-
-                perfc_incrc(shadow_update_va_fail1);
-            }
-            else
-                perfc_incrc(shadow_update_va_fail2);
+            shadow_unlock(d);
+            check_pagetable(ed, "post-va"); /* debug */
         }
-
-        /*
-         * If we're in log-dirty mode then we need to note that we've updated
-         * the PTE in the PT-holding page. We need the machine frame number
-         * for this.
-         */
-        if ( shadow_mode_log_dirty(d) )
-            mark_dirty(d, va_to_l1mfn(va));
-  
-        check_pagetable(d, ed->arch.guest_table, "va"); /* debug */
     }
 
     deferred_ops = percpu_info[cpu].deferred_ops;
@@ -1994,9 +2053,17 @@ int do_update_va_mapping(unsigned long va,
 
     if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || 
          unlikely(flags & UVMF_FLUSH_TLB) )
+    {
         local_flush_tlb();
+        if ( unlikely(shadow_mode_enabled(d)) )
+            shadow_sync_all(d);
+    }
     else if ( unlikely(flags & UVMF_INVLPG) )
+    {
         __flush_tlb_one(va);
+        if ( unlikely(shadow_mode_enabled(d)) )
+            shadow_invlpg(current, va);
+    }
 
     if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
         (void)map_ldt_shadow_page(0);
@@ -2067,6 +2134,8 @@ long set_gdt(struct exec_domain *ed,
     if ( (pfn = frames[0]) >= max_page )
         goto fail;
 
+    shadow_sync_all(d);
+
     /* The first page is special because Xen owns a range of entries in it. */
     if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
     {
@@ -2146,7 +2215,9 @@ long do_set_gdt(unsigned long *frame_list, unsigned int entries)
 long do_update_descriptor(
     unsigned long pa, unsigned long word1, unsigned long word2)
 {
-    unsigned long pfn = pa >> PAGE_SHIFT;
+    struct domain *dom = current->domain;
+    unsigned long gpfn = pa >> PAGE_SHIFT;
+    unsigned long mfn;
     struct desc_struct *gdt_pent, d;
     struct pfn_info *page;
     struct exec_domain *ed;
@@ -2155,16 +2226,21 @@ long do_update_descriptor(
     d.a = (u32)word1;
     d.b = (u32)word2;
 
-    LOCK_BIGLOCK(current->domain);
+    LOCK_BIGLOCK(dom);
 
-    if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) {
-        UNLOCK_BIGLOCK(current->domain);
+    if ( !(mfn = __gpfn_to_mfn(dom, gpfn)) ) {
+        UNLOCK_BIGLOCK(dom);
         return -EINVAL;
     }
 
-    page = &frame_table[pfn];
-    if ( unlikely(!get_page(page, current->domain)) ) {
-        UNLOCK_BIGLOCK(current->domain);
+    if ( (pa & 7) || (mfn >= max_page) || !check_descriptor(&d) ) {
+        UNLOCK_BIGLOCK(dom);
+        return -EINVAL;
+    }
+
+    page = &frame_table[mfn];
+    if ( unlikely(!get_page(page, dom)) ) {
+        UNLOCK_BIGLOCK(dom);
         return -EINVAL;
     }
 
@@ -2173,8 +2249,8 @@ long do_update_descriptor(
     {
     case PGT_gdt_page:
         /* Disallow updates of Xen-reserved descriptors in the current GDT. */
-        for_each_exec_domain(current->domain, ed) {
-            if ( (l1_pgentry_to_pfn(ed->arch.perdomain_ptes[0]) == pfn) &&
+        for_each_exec_domain(dom, ed) {
+            if ( (l1_pgentry_to_pfn(ed->arch.perdomain_ptes[0]) == mfn) &&
                  (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
                  (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
                 goto out;
@@ -2192,11 +2268,25 @@ long do_update_descriptor(
         break;
     }
 
+    if ( shadow_mode_enabled(dom) )
+    {
+        shadow_lock(dom);
+
+        if ( shadow_mode_log_dirty(dom) )
+            __mark_dirty(dom, mfn);
+
+        if ( page_is_page_table(page) )
+            shadow_mark_mfn_out_of_sync(current, gpfn, mfn);
+    }
+
     /* All is good so make the update. */
-    gdt_pent = map_domain_mem(pa);
+    gdt_pent = map_domain_mem((mfn << PAGE_SHIFT) | (pa & ~PAGE_MASK));
     memcpy(gdt_pent, &d, 8);
     unmap_domain_mem(gdt_pent);
 
+    if ( shadow_mode_enabled(dom) )
+        shadow_unlock(dom);
+
     put_page_type(page);
 
     ret = 0; /* success */
@@ -2204,7 +2294,7 @@ long do_update_descriptor(
  out:
     put_page(page);
 
-    UNLOCK_BIGLOCK(current->domain);
+    UNLOCK_BIGLOCK(dom);
 
     return ret;
 }
@@ -2229,8 +2319,8 @@ int ptwr_debug = 0x0;
 /* Flush the given writable p.t. page and write-protect it again. */
 void ptwr_flush(const int which)
 {
-    unsigned long  sstat, spte, pte, *ptep, l1va;
-    l1_pgentry_t  *sl1e = NULL, *pl1e, ol1e, nl1e;
+    unsigned long  pte, *ptep, l1va;
+    l1_pgentry_t  *pl1e, ol1e, nl1e;
     l2_pgentry_t  *pl2e;
     int            i, cpu = smp_processor_id();
     struct exec_domain *ed = current;
@@ -2239,6 +2329,9 @@ void ptwr_flush(const int which)
     unsigned int   modified = 0;
 #endif
 
+    // not supported in combination with various shadow modes!
+    ASSERT( !shadow_mode_enabled(d) );
+    
     l1va = ptwr_info[cpu].ptinfo[which].l1va;
     ptep = (unsigned long *)&linear_pg_table[l1_linear_offset(l1va)];
 
@@ -2248,7 +2341,7 @@ void ptwr_flush(const int which)
 
     if ( unlikely(__get_user(pte, ptep)) )
     {
-        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
+        MEM_LOG("ptwr: Could not read pte at %p", ptep);
         /*
          * Really a bug. We could read this PTE during the initial fault,
          * and pagetables can't have changed meantime. XXX Multi-CPU guests?
@@ -2259,23 +2352,10 @@ void ptwr_flush(const int which)
                 PTWR_PRINT_WHICH, ptep, pte);
     pte &= ~_PAGE_RW;
 
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        /* Write-protect the p.t. page in the shadow page table. */
-        l1pte_propagate_from_guest(d, &pte, &spte);
-        __put_user(spte, (unsigned long *)
-                   &shadow_linear_pg_table[l1_linear_offset(l1va)]);
-
-        /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
-        sstat = get_shadow_status(d, pte >> PAGE_SHIFT);
-        if ( sstat & PSH_shadowed )
-            sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
-    }
-
     /* Write-protect the p.t. page in the guest page table. */
     if ( unlikely(__put_user(pte, ptep)) )
     {
-        MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
+        MEM_LOG("ptwr: Could not update pte at %p", ptep);
         /*
          * Really a bug. We could write this PTE during the initial fault,
          * and pagetables can't have changed meantime. XXX Multi-CPU guests?
@@ -2318,13 +2398,7 @@ void ptwr_flush(const int which)
         if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
         {
             if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
-            {
-                if ( unlikely(sl1e != NULL) )
-                    l1pte_propagate_from_guest(
-                        d, &l1_pgentry_val(nl1e), 
-                        &l1_pgentry_val(sl1e[i]));
                 put_page_type(&frame_table[l1_pgentry_to_pfn(nl1e)]);
-            }
             continue;
         }
 
@@ -2343,24 +2417,20 @@ void ptwr_flush(const int which)
             domain_crash();
         }
         
-        if ( unlikely(sl1e != NULL) )
-            l1pte_propagate_from_guest(
-                d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
-
         if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
             put_page_from_l1e(ol1e, d);
     }
     unmap_domain_mem(pl1e);
-
+    
     perfc_incr_histo(wpt_updates, modified, PT_UPDATES);
 
     /*
      * STEP 3. Reattach the L1 p.t. page into the current address space.
      */
 
-    if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode_enabled(d)) )
+    if ( which == PTWR_PT_ACTIVE )
     {
-        pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
+        pl2e = &linear_l2_table(ed)[ptwr_info[cpu].ptinfo[which].l2_idx];
         *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); 
     }
 
@@ -2369,23 +2439,21 @@ void ptwr_flush(const int which)
      */
 
     ptwr_info[cpu].ptinfo[which].l1va = 0;
-
-    if ( unlikely(sl1e != NULL) )
-    {
-        unmap_domain_mem(sl1e);
-        put_shadow_status(d);
-    }
 }
 
 /* Write page fault handler: check if guest is trying to modify a PTE. */
 int ptwr_do_page_fault(unsigned long addr)
 {
+    struct exec_domain *ed = current;
     unsigned long    pte, pfn, l2e;
     struct pfn_info *page;
     l2_pgentry_t    *pl2e;
     int              which, cpu = smp_processor_id();
     u32              l2_idx;
 
+    // not supported in combination with various shadow modes!
+    ASSERT( !shadow_mode_enabled(ed->domain) );
+    
 #ifdef __x86_64__
     return 0; /* Writable pagetables need fixing for x86_64. */
 #endif
@@ -2394,10 +2462,7 @@ int ptwr_do_page_fault(unsigned long addr)
      * Attempt to read the PTE that maps the VA being accessed. By checking for
      * PDE validity in the L2 we avoid many expensive fixups in __get_user().
      */
-    if ( shadow_mode_external(current->domain) )
-        BUG(); // can't use linear_l2_table with external tables.
-
-    if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
+    if ( !(l2_pgentry_val(linear_l2_table(ed)[addr>>L2_PAGETABLE_SHIFT]) &
            _PAGE_PRESENT) ||
          __get_user(pte, (unsigned long *)
                     &linear_pg_table[l1_linear_offset(addr)]) )
@@ -2425,7 +2490,7 @@ int ptwr_do_page_fault(unsigned long addr)
 
     if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
     {
-        MEM_LOG("PTWR failure! Pagetable maps itself at %p\n", addr);
+        MEM_LOG("PTWR failure! Pagetable maps itself at %p", addr);
         domain_crash();
     }
 
@@ -2433,10 +2498,7 @@ int ptwr_do_page_fault(unsigned long addr)
      * Is the L1 p.t. mapped into the current address space? If so we call it
      * an ACTIVE p.t., otherwise it is INACTIVE.
      */
-    if ( shadow_mode_external(current->domain) )
-        BUG(); // can't use linear_l2_table with external tables.
-
-    pl2e = &linear_l2_table[l2_idx];
+    pl2e = &linear_l2_table(ed)[l2_idx];
     l2e  = l2_pgentry_val(*pl2e);
     which = PTWR_PT_INACTIVE;
     if ( (l2e >> PAGE_SHIFT) == pfn )
@@ -2472,8 +2534,7 @@ int ptwr_do_page_fault(unsigned long addr)
     ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
     
     /* For safety, disconnect the L1 p.t. page from current space. */
-    if ( (which == PTWR_PT_ACTIVE) && 
-         likely(!shadow_mode_enabled(current->domain)) )
+    if ( which == PTWR_PT_ACTIVE )
     {
         *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
 #if 1
@@ -2496,7 +2557,7 @@ int ptwr_do_page_fault(unsigned long addr)
     if ( unlikely(__put_user(pte, (unsigned long *)
                              &linear_pg_table[addr>>PAGE_SHIFT])) )
     {
-        MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
+        MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *)
                 &linear_pg_table[addr>>PAGE_SHIFT]);
         /* Toss the writable pagetable state and crash. */
         unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
@@ -2542,7 +2603,7 @@ void ptwr_status(void)
         [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
 
     if ( __get_user(pte, ptep) ) {
-        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
+        MEM_LOG("ptwr: Could not read pte at %p", ptep);
         domain_crash();
     }
 
@@ -2558,7 +2619,7 @@ void ptwr_status(void)
 
     if ( __get_user(pte, (unsigned long *)
                     ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
-        MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
+        MEM_LOG("ptwr: Could not read pte at %p", (unsigned long *)
                 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
         domain_crash();
     }
@@ -2566,433 +2627,6 @@ void ptwr_status(void)
     page = &frame_table[pfn];
 }
 
-void audit_domain(struct domain *d)
-{
-    int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
-
-    void adjust (struct pfn_info *page, int dir, int adjtype)
-    {
-        int count = page->count_info & PGC_count_mask;
-
-        if ( adjtype )
-        {
-            int tcount = page->u.inuse.type_info & PGT_count_mask;
-            
-            ttot++;
-
-            tcount += dir;
-
-            if ( tcount < 0 )
-            {
-                /* This will only come out once. */
-                printk("Audit %d: type count whent below zero pfn=%x "
-                       "taf=%x otaf=%x\n",
-                       d->id, page-frame_table,
-                       page->u.inuse.type_info,
-                       page->tlbflush_timestamp);
-            }
-            
-            page->u.inuse.type_info =
-                (page->u.inuse.type_info & ~PGT_count_mask) | 
-                (tcount & PGT_count_mask);
-        }
-
-        ctot++;
-        count += dir;
-        if ( count < 0 )
-        {
-            /* This will only come out once. */
-            printk("Audit %d: general count whent below zero pfn=%x "
-                   "taf=%x otaf=%x\n",
-                   d->id, page-frame_table,
-                   page->u.inuse.type_info,
-                   page->tlbflush_timestamp);
-        }
-            
-        page->count_info =
-            (page->count_info & ~PGC_count_mask) | 
-            (count & PGC_count_mask);            
-
-    }
-
-    void scan_for_pfn(struct domain *d, unsigned long xpfn)
-    {
-        unsigned long pfn, *pt;
-        struct list_head *list_ent;
-        struct pfn_info *page;
-        int i;
-
-        list_ent = d->page_list.next;
-        for ( i = 0; (list_ent != &d->page_list); i++ )
-        {
-            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
-            page = &frame_table[pfn];
-            
-            switch ( page->u.inuse.type_info & PGT_type_mask )
-            {
-            case PGT_l1_page_table:
-            case PGT_l2_page_table:
-                pt = map_domain_mem(pfn<<PAGE_SHIFT);
-                for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-                    if ( (pt[i] & _PAGE_PRESENT) &&
-                         ((pt[i] >> PAGE_SHIFT) == xpfn) )
-                        printk("     found dom=%d i=%x pfn=%lx t=%x c=%x\n",
-                               d->id, i, pfn, page->u.inuse.type_info,
-                               page->count_info);
-                unmap_domain_mem(pt);           
-            }
-
-            list_ent = frame_table[pfn].list.next;
-        }
-
-    }
-
-    void scan_for_pfn_remote(unsigned long xpfn)
-    {
-        struct domain *e;
-        for_each_domain ( e )
-            scan_for_pfn( e, xpfn );            
-    }   
-
-    int i, l1, l2;
-    unsigned long pfn;
-    struct list_head *list_ent;
-    struct pfn_info *page;
-
-    if ( d != current->domain )
-        domain_pause(d);
-    synchronise_pagetables(~0UL);
-
-    printk("pt base=%lx sh_info=%x\n",
-           pagetable_val(d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT,
-           virt_to_page(d->shared_info)-frame_table);
-           
-    spin_lock(&d->page_alloc_lock);
-
-    /* PHASE 0 */
-
-    list_ent = d->page_list.next;
-    for ( i = 0; (list_ent != &d->page_list); i++ )
-    {
-        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
-        page = &frame_table[pfn];
-
-        if ( page_get_owner(page) != d )
-            BUG();
-
-        if ( (page->u.inuse.type_info & PGT_count_mask) >
-             (page->count_info & PGC_count_mask) )
-            printk("taf > caf %x %x pfn=%lx\n",
-                   page->u.inuse.type_info, page->count_info, pfn );
-#if 0   /* SYSV shared memory pages plus writeable files. */
-        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && 
-             (page->u.inuse.type_info & PGT_count_mask) > 1 )
-        {
-            printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
-                  pfn,
-                  page->u.inuse.type_info,
-                  page->count_info );
-            scan_for_pfn_remote(pfn);
-        }
-#endif
-        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && 
-             (page->u.inuse.type_info & PGT_count_mask) > 1 )
-        {
-            printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
-                  pfn,
-                  page->u.inuse.type_info,
-                  page->count_info );
-        }
-
-        /* Use tlbflush_timestamp to store original type_info. */
-        page->tlbflush_timestamp = page->u.inuse.type_info;
-
-        list_ent = frame_table[pfn].list.next;
-    }
-
-
-    /* PHASE 1 */
-    if ( pagetable_val(d->exec_domain[0]->arch.guest_table) )
-        adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.guest_table)
-                           >>PAGE_SHIFT], -1, 1);
-
-    list_ent = d->page_list.next;
-    for ( i = 0; (list_ent != &d->page_list); i++ )
-    {
-        unsigned long *pt;
-        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
-        page = &frame_table[pfn];
-
-        if ( page_get_owner(page) != d )
-            BUG();
-
-        switch ( page->u.inuse.type_info & PGT_type_mask )
-        {
-        case PGT_l2_page_table:
-
-            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
-                printk("Audit %d: L2 not validated %x\n",
-                       d->id, page->u.inuse.type_info);
-
-            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
-                printk("Audit %d: L2 not pinned %x\n",
-                       d->id, page->u.inuse.type_info);
-            else
-                adjust( page, -1, 1 );
-           
-            pt = map_domain_mem( pfn<<PAGE_SHIFT );
-
-            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
-            {
-                if ( pt[i] & _PAGE_PRESENT )
-                {
-                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
-                    struct pfn_info *l1page = &frame_table[l1pfn];
-
-                    if ( page_get_owner(l1page) != d )
-                    {
-                        printk("L2: Skip bizarre page belonging to other "
-                               "dom %p\n", page_get_owner(l1page));
-                        continue;
-                    }
-                    
-                    if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
-                         PGT_l2_page_table )
-                        printk("Audit %d: [%x] Found %s Linear PT "
-                               "t=%x pfn=%lx\n", d->id, i, 
-                               (l1pfn==pfn) ? "Self" : "Other",
-                               l1page->u.inuse.type_info,
-                               l1pfn);
-                    else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
-                              PGT_l1_page_table )
-                        printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
-                               d->id, i,
-                               l1page->u.inuse.type_info,
-                               l1pfn);
-
-                    adjust(l1page, -1, 1);
-                }
-            }
-
-            unmap_domain_mem(pt);
-
-            break;
-
-
-        case PGT_l1_page_table:
-            
-            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
-                adjust( page, -1, 1 );
-
-            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
-                printk("Audit %d: L1 not validated %x\n",
-                       d->id, page->u.inuse.type_info);
-#if 0
-            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
-                printk("Audit %d: L1 not pinned %x\n",
-                       d->id, page->u.inuse.type_info);
-#endif
-            pt = map_domain_mem( pfn<<PAGE_SHIFT );
-
-            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-            {
-                if ( pt[i] & _PAGE_PRESENT )
-                {
-                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
-                    struct pfn_info *l1page = &frame_table[l1pfn];
-
-                    if ( l1pfn < 0x100 )
-                    {
-                        lowmem_mappings++;
-                        continue;
-                    }
-
-                    if ( l1pfn > max_page )
-                    {
-                        io_mappings++;
-                        continue;
-                    }
-
-                    if ( pt[i] & _PAGE_RW )
-                    {
-
-                        if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
-                             PGT_l1_page_table ||
-                             (l1page->u.inuse.type_info & PGT_type_mask) ==
-                             PGT_l2_page_table )
-                            printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
-                                   d->id, i,
-                                   l1page->u.inuse.type_info,
-                                   l1pfn);
-
-                    }
-
-                    if ( page_get_owner(l1page) != d )
-                    {
-                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
-                               "pfn=%lx c=%08x t=%08x m2p=%lx\n",
-                               d->id, pfn, i,
-                               page_get_owner(l1page),
-                               l1pfn,
-                               l1page->count_info,
-                               l1page->u.inuse.type_info,
-                               machine_to_phys_mapping[l1pfn]);    
-                        continue;
-                    }
-
-                    adjust(l1page, -1, 0);
-                }
-            }
-
-            unmap_domain_mem(pt);
-
-            break;
-        }       
-
-        list_ent = frame_table[pfn].list.next;
-    }
-
-    if ( (io_mappings > 0) || (lowmem_mappings > 0) )
-        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
-               d->id, lowmem_mappings, io_mappings);
-
-    /* PHASE 2 */
-
-    ctot = ttot = 0;
-    list_ent = d->page_list.next;
-    for ( i = 0; (list_ent != &d->page_list); i++ )
-    {
-        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
-        page = &frame_table[pfn];
-
-        switch ( page->u.inuse.type_info & PGT_type_mask)
-        {
-        case PGT_l1_page_table:
-        case PGT_l2_page_table:
-            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
-            {
-                printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
-                       d->id, page->u.inuse.type_info, 
-                       page->tlbflush_timestamp,
-                       page->count_info, pfn );
-                scan_for_pfn_remote(pfn);
-            }
-        default:
-            if ( (page->count_info & PGC_count_mask) != 1 )
-            {
-                printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
-                       d->id, 
-                       page->count_info,
-                       page->u.inuse.type_info, 
-                       page->tlbflush_timestamp, pfn );
-                scan_for_pfn_remote(pfn);
-            }
-            break;
-        }
-
-        list_ent = frame_table[pfn].list.next;
-    }
-
-    /* PHASE 3 */
-    list_ent = d->page_list.next;
-    l1 = l2 = 0;
-    for ( i = 0; (list_ent != &d->page_list); i++ )
-    {
-        unsigned long *pt;
-        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
-        page = &frame_table[pfn];
-
-        switch ( page->u.inuse.type_info & PGT_type_mask )
-        {
-        case PGT_l2_page_table:
-           l2++;
-            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
-                adjust( page, 1, 1 );          
-
-            pt = map_domain_mem( pfn<<PAGE_SHIFT );
-
-            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
-            {
-                if ( pt[i] & _PAGE_PRESENT )
-                {
-                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
-                    struct pfn_info *l1page;
-
-                    if (l1pfn>max_page)
-                        continue;
-
-                    l1page = &frame_table[l1pfn];
-
-                    if ( page_get_owner(l1page) == d )
-                        adjust(l1page, 1, 1);
-                }
-            }
-
-            unmap_domain_mem(pt);
-            break;
-
-        case PGT_l1_page_table:
-           l1++;
-            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
-                adjust( page, 1, 1 );
-
-            pt = map_domain_mem( pfn<<PAGE_SHIFT );
-
-            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-            {
-                if ( pt[i] & _PAGE_PRESENT )
-                {
-                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
-                    struct pfn_info *l1page;
-
-                    if (l1pfn>max_page)
-                        continue;
-
-                    l1page = &frame_table[l1pfn];
-
-                    if ( (page_get_owner(l1page) != d) ||
-                         (l1pfn < 0x100) || (l1pfn > max_page) )
-                        continue;
-
-                    adjust(l1page, 1, 0);
-                }
-            }
-
-            unmap_domain_mem(pt);
-            break;
-        }
-
-
-        page->tlbflush_timestamp = 0;
-
-        list_ent = frame_table[pfn].list.next;
-    }
-
-    spin_unlock(&d->page_alloc_lock);
-
-    if ( pagetable_val(d->exec_domain[0]->arch.guest_table) )
-        adjust(&frame_table[pagetable_val(
-            d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT], 1, 1);
-
-    printk("Audit %d: Done. pages=%d l1=%d l2=%d ctot=%d ttot=%d\n", d->id, i, l1, l2, ctot, ttot );
-
-    if ( d != current->domain )
-        domain_unpause(d);
-}
-
-void audit_domains(void)
-{
-    struct domain *d;
-    for_each_domain ( d )
-        audit_domain(d);
-}
-
-void audit_domains_key(unsigned char key)
-{
-    audit_domains();
-}
-
 #endif /* NDEBUG */
 
 /*
index 1ac97c6da6c4af1909a75baaa6192f5b6839291d..b32438f49764ebfa34f5783cfd03f414aa1a0b2b 100644 (file)
@@ -1,3 +1,23 @@
+/******************************************************************************
+ * arch/x86/shadow.c
+ * 
+ * Copyright (c) 2005 Michael A Fetterman
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
 
 #include <xen/config.h>
 #include <xen/types.h>
@@ -8,6 +28,10 @@
 #include <xen/event.h>
 #include <xen/trace.h>
 
+static void shadow_free_snapshot(struct domain *d,
+                                 struct out_of_sync_entry *entry);
+static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn);
+
 /********
 
 There's a per-domain shadow table spin lock which works fine for SMP
@@ -20,34 +44,401 @@ hypercall lock anyhow (at least initially).
 
 ********/
 
-static inline void free_shadow_page(
-    struct domain *d, struct pfn_info *page)
+static inline int
+shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
+               unsigned long new_type)
 {
-    d->arch.shadow_page_count--;
+    unsigned long min_type, max_type;
+    struct pfn_info *page = pfn_to_page(gmfn);
+    int pinned = 0, okay = 1;
+
+    if ( page_out_of_sync(page) )
+    {
+        // Don't know how long ago this snapshot was taken.
+        // Can't trust it to be recent enough.
+        //
+        __shadow_sync_mfn(d, gmfn);
+    }
+
+    if ( unlikely(mfn_is_page_table(gmfn)) )
+    {
+        min_type = shadow_max_pgtable_type(d, gpfn) + PGT_l1_shadow;
+        max_type = new_type;
+    }
+    else
+    {
+        min_type = PGT_l1_shadow;
+        max_type = PGT_l1_shadow;
+    }
+    FSH_LOG("shadow_promote gpfn=%p gmfn=%p nt=%p min=%p max=%p\n",
+            gpfn, gmfn, new_type, min_type, max_type);
+
+    if ( min_type <= max_type )
+        shadow_remove_all_write_access(d, min_type, max_type, gpfn);
+
+    // To convert this page to use as a page table, the writable count
+    // should now be zero.  Test this by grabbing the page as a page table,
+    // and then immediately releasing.  This will also deal with any
+    // necessary TLB flushing issues for us.
+    //
+    // The cruft here about pinning doesn't really work right.  This
+    // needs rethinking/rewriting...  Need to gracefully deal with the
+    // TLB flushes required when promoting a writable page, and also deal
+    // with any outstanding (external) writable refs to this page (by
+    // refusing to promote it).  The pinning headache complicates this
+    // code -- it would all get much simpler if we stop using
+    // shadow_lock() and move the shadow code to BIGLOCK().
+    //
+    if ( unlikely(!get_page(page, d)) )
+        BUG();
+    if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
+    {
+        pinned = 1;
+        put_page_and_type(page);
+    }
+    if ( get_page_type(page, PGT_base_page_table) )
+    {
+        put_page_type(page);
+        set_bit(_PGC_page_table, &frame_table[gmfn].count_info);
+    }
+    else
+    {
+        printk("shadow_promote: get_page_type failed "
+               "dom%d gpfn=%p gmfn=%p t=%x\n",
+               d->id, gpfn, gmfn, new_type);
+        okay = 0;
+    }
+
+    // Now put the type back to writable...
+    if ( unlikely(!get_page_type(page, PGT_writable_page)) )
+        BUG();
+    if ( unlikely(pinned) )
+    {
+        if ( unlikely(test_and_set_bit(_PGT_pinned,
+                                       &page->u.inuse.type_info)) )
+            BUG(); // hmm... someone pinned this again?
+    }
+    else
+        put_page_and_type(page);
+
+    return okay;
+}
+
+static inline void
+shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
+{
+    ASSERT(frame_table[gmfn].count_info & PGC_page_table);
+
+    if ( shadow_max_pgtable_type(d, gpfn) == PGT_none )
+    {
+        clear_bit(_PGC_page_table, &frame_table[gmfn].count_info);
+
+        if ( page_out_of_sync(pfn_to_page(gmfn)) )
+        {
+            remove_out_of_sync_entries(d, gmfn);
+        }
+    }
+}
+
+/*
+ * Things in shadow mode that collect get_page() refs to the domain's
+ * pages are:
+ * - PGC_allocated takes a gen count, just like normal.
+ * - A writable page can be pinned (paravirtualized guests may consider
+ *   these pages to be L1s or L2s, and don't know the difference).
+ *   Pinning a page takes a gen count (but, for domains in shadow mode,
+ *   it *doesn't* take a type count)
+ * - CR3 grabs a ref to whatever it points at, just like normal.
+ * - Shadow mode grabs an initial gen count for itself, as a placeholder
+ *   for whatever references will exist.
+ * - Shadow PTEs that point to a page take a gen count, just like regular
+ *   PTEs.  However, they don't get a type count, as get_page_type() is
+ *   hardwired to keep writable pages' counts at 1 for domains in shadow
+ *   mode.
+ * - Whenever we shadow a page, the entry in the shadow hash grabs a
+ *   general ref to the page.
+ * - Whenever a page goes out of sync, the out of sync entry grabs a
+ *   general ref to the page.
+ */
+/*
+ * pfn_info fields for pages allocated as shadow pages:
+ *
+ * All 32 bits of count_info are a simple count of refs to this shadow
+ * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
+ * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync
+ * references.
+ *
+ * u.inuse._domain is left NULL, to prevent accidentally allowing some random
+ * domain from gaining permissions to map this page.
+ *
+ * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
+ * shadowed.
+ * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
+ * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
+ * currently exists because this is a shadow of a root page, and we
+ * don't want to let those disappear just because no CR3 is currently pointing
+ * at it.
+ *
+ * tlbflush_timestamp holds a pickled pointer to the domain.
+ */
+
+static inline unsigned long
+alloc_shadow_page(struct domain *d,
+                  unsigned long gpfn, unsigned long gmfn,
+                  u32 psh_type)
+{
+    struct pfn_info *page;
+    unsigned long smfn;
+    int pin = 0;
+
+    if ( (psh_type != PGT_snapshot) &&
+         !shadow_promote(d, gpfn, gmfn, psh_type) )
+    {
+        FSH_LOG("promotion of pfn=%p mfn=%p failed!  external gnttab refs?\n",
+                gpfn, gmfn);
+        return 0;
+    }
+
+    page = alloc_domheap_page(NULL);
+    if ( unlikely(page == NULL) )
+    {
+        printk("Couldn't alloc shadow page! dom%d count=%d\n",
+               d->id, d->arch.shadow_page_count);
+        printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
+               perfc_value(shadow_l1_pages), 
+               perfc_value(shadow_l2_pages),
+               perfc_value(hl2_table_pages),
+               perfc_value(snapshot_pages));
+        BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
+    }
+
+    smfn = page_to_pfn(page);
+
+    ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
+    page->u.inuse.type_info = psh_type | gmfn;
+    page->count_info = 0;
+    page->tlbflush_timestamp = pickle_domptr(d);
+
+    switch ( psh_type )
+    {
+    case PGT_l1_shadow:
+        perfc_incr(shadow_l1_pages);
+        d->arch.shadow_page_count++;
+        break;
 
-    switch ( page->u.inuse.type_info & PGT_type_mask )
+    case PGT_l2_shadow:
+        perfc_incr(shadow_l2_pages);
+        d->arch.shadow_page_count++;
+        if ( PGT_l2_page_table == PGT_root_page_table )
+            pin = 1;
+
+        break;
+
+    case PGT_hl2_shadow:
+        perfc_incr(hl2_table_pages);
+        d->arch.hl2_page_count++;
+
+        // treat an hl2 as an L1 for purposes of promotion,
+        // and as an L2 for purposes of pinning.
+        //
+        if ( PGT_l2_page_table == PGT_root_page_table )
+            pin = 1;
+
+        break;
+
+    case PGT_snapshot:
+        perfc_incr(snapshot_pages);
+        d->arch.snapshot_page_count++;
+        break;
+
+    default:
+        printk("Alloc shadow weird page type type=%08x\n", psh_type);
+        BUG();
+        break;
+    }
+
+    set_shadow_status(d, gpfn, smfn, psh_type);
+
+    if ( pin )
+        shadow_pin(smfn);
+
+    return smfn;
+}
+
+static void inline
+free_shadow_l1_table(struct domain *d, unsigned long smfn)
+{
+    l1_pgentry_t *pl1e = map_domain_mem(smfn << PAGE_SHIFT);
+    int i;
+
+    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        put_page_from_l1e(pl1e[i], d);
+
+    unmap_domain_mem(pl1e);
+}
+
+static inline void
+free_shadow_hl2_table(struct domain *d, unsigned long smfn)
+{
+    l1_pgentry_t *pl1e = map_domain_mem(smfn << PAGE_SHIFT);
+    int i, limit;
+
+    printk("free_shadow_hl2_table(smfn=%p)\n", smfn);
+
+    if ( shadow_mode_external(d) )
+        limit = L2_PAGETABLE_ENTRIES;
+    else
+        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
+
+    for ( i = 0; i < limit; i++ )
+        put_page_from_l1e(pl1e[i], d);
+
+    unmap_domain_mem(pl1e);
+}
+
+static inline void
+free_shadow_l2_table(struct domain *d, unsigned long smfn)
+{
+    unsigned long *pl2e = map_domain_mem(smfn << PAGE_SHIFT);
+    int i, external = shadow_mode_external(d);
+
+    printk("free_shadow_l2_table(smfn=%p)\n", smfn);
+
+    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+        if ( external || is_guest_l2_slot(i) )
+            if ( pl2e[i] & _PAGE_PRESENT )
+                put_shadow_ref(pl2e[i] >> PAGE_SHIFT);
+
+    if ( (PGT_base_page_table == PGT_l2_page_table) &&
+         shadow_mode_translate(d) &&
+         !shadow_mode_external(d) )
+    {
+        // free the ref to the hl2
+        //
+        put_shadow_ref(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]
+                       >> PAGE_SHIFT);
+    }
+
+    unmap_domain_mem(pl2e);
+}
+
+void free_shadow_page(unsigned long smfn)
+{
+    struct pfn_info *page = &frame_table[smfn];
+    struct domain *d = unpickle_domptr(page->tlbflush_timestamp);
+    unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
+    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
+    unsigned long type = page->u.inuse.type_info & PGT_type_mask;
+
+    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
+
+    delete_shadow_status(d, gpfn, type);
+
+    switch ( type )
     {
-    case PGT_l1_page_table:
+    case PGT_l1_shadow:
         perfc_decr(shadow_l1_pages);
+        shadow_demote(d, gpfn, gmfn);
+        free_shadow_l1_table(d, smfn);
         break;
 
-    case PGT_l2_page_table:
+    case PGT_l2_shadow:
         perfc_decr(shadow_l2_pages);
+        shadow_demote(d, gpfn, gmfn);
+        free_shadow_l2_table(d, smfn);
+        break;
+
+    case PGT_hl2_shadow:
+        perfc_decr(hl2_table_pages);
+        shadow_demote(d, gpfn, gmfn);
+        free_shadow_hl2_table(d, smfn);
+        break;
+
+    case PGT_snapshot:
+        perfc_decr(snapshot_pages);
         break;
 
     default:
-        printk("Free shadow weird page type pfn=%08x type=%08x\n",
-               frame_table-page, page->u.inuse.type_info);
+        printk("Free shadow weird page type mfn=%08x type=%08x\n",
+               page-frame_table, page->u.inuse.type_info);
         break;
     }
 
+    d->arch.shadow_page_count--;
+
+    // No TLB flushes are needed the next time this page gets allocated.
+    //
+    page->tlbflush_timestamp = 0;
+    page->u.free.cpu_mask = 0;
+
     free_domheap_page(page);
 }
 
-void free_shadow_state(struct domain *d)
+static void inline
+release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
+{
+    struct pfn_info *page;
+
+    page = &frame_table[entry->gmfn];
+        
+    // Decrement ref count of guest & shadow pages
+    //
+    put_page(page);
+
+    // Only use entries that have low bits clear...
+    //
+    if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
+        put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
+
+    // Free the snapshot
+    //
+    shadow_free_snapshot(d, entry);
+}
+
+static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
+{
+    struct out_of_sync_entry *entry = d->arch.out_of_sync;
+    struct out_of_sync_entry **prev = &d->arch.out_of_sync;
+
+    while ( entry )
+    {
+        if ( entry->gmfn == gmfn )
+        {
+            release_out_of_sync_entry(d, entry);
+            *prev = entry = entry->next;
+            continue;
+        }
+        prev = &entry->next;
+        entry = entry->next;
+    }
+}
+
+static void free_out_of_sync_state(struct domain *d)
+{
+    struct out_of_sync_entry *entry;
+    struct out_of_sync_entry **tail = NULL;
+
+    // Add the list of out-of-sync entries to the free list of entries.
+    // Not the smartest code.  But it works.
+    //
+    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
+    {
+        release_out_of_sync_entry(d, entry);
+        tail = &entry->next;
+    }
+    if ( tail )
+    {
+        *tail = d->arch.out_of_sync_free;
+        d->arch.out_of_sync_free = d->arch.out_of_sync;
+        d->arch.out_of_sync = NULL;
+    }
+}
+
+static void free_shadow_pages(struct domain *d)
 {
     int                   i, free = 0;
     struct shadow_status *x, *n;
+    struct exec_domain   *e;
  
     /*
      * WARNING! The shadow page table must not currently be in use!
@@ -58,21 +449,37 @@ void free_shadow_state(struct domain *d)
 
     if( !d->arch.shadow_ht ) return;
 
-    /* Free each hash chain in turn. */
+    // first, remove any outstanding refs from out_of_sync entries...
+    //
+    free_out_of_sync_state(d);
+
+    // second, remove any outstanding refs from ed->arch.shadow_table...
+    //
+    for_each_exec_domain(d, e)
+    {
+        if ( pagetable_val(e->arch.shadow_table) )
+        {
+            put_shadow_ref(pagetable_val(e->arch.shadow_table) >> PAGE_SHIFT);
+            e->arch.shadow_table = mk_pagetable(0);
+        }
+    }
+
+    // Now, the only refs to shadow pages that are left are from the shadow
+    // pages themselves.  We can just free them.
+    //
     for ( i = 0; i < shadow_ht_buckets; i++ )
     {
         /* Skip empty buckets. */
         x = &d->arch.shadow_ht[i];
-        if ( x->pfn == 0 )
+        if ( x->gpfn_and_flags == 0 )
             continue;
 
         /* Free the head page. */
-        free_shadow_page(
-            d, &frame_table[x->smfn_and_flags & PSH_pfn_mask]);
+        free_shadow_page(x->smfn);
 
         /* Reinitialise the head node. */
-        x->pfn            = 0;
-        x->smfn_and_flags = 0;
+        x->gpfn_and_flags = 0;
+        x->smfn           = 0;
         n                 = x->next;
         x->next           = NULL;
 
@@ -82,16 +489,15 @@ void free_shadow_state(struct domain *d)
         for ( x = n; x != NULL; x = n )
         { 
             /* Free the shadow page. */
-            free_shadow_page(
-                d, &frame_table[x->smfn_and_flags & PSH_pfn_mask]);
+            free_shadow_page(x->smfn);
 
             /* Re-initialise the chain node. */
-            x->pfn            = 0;
-            x->smfn_and_flags = 0;
+            x->gpfn_and_flags = 0;
+            x->smfn           = 0;
 
             /* Add to the free list. */
-            n                 = x->next;
-            x->next           = d->arch.shadow_ht_free;
+            n       = x->next;
+            x->next = d->arch.shadow_ht_free;
             d->arch.shadow_ht_free = x;
 
             free++;
@@ -103,80 +509,140 @@ void free_shadow_state(struct domain *d)
     SH_LOG("Free shadow table. Freed=%d.", free);
 }
 
-static inline int clear_shadow_page(
-    struct domain *d, struct shadow_status *x)
+void shadow_mode_init(void)
 {
-    unsigned long   *p;
-    int              restart = 0;
-    struct pfn_info *spage = &frame_table[x->smfn_and_flags & PSH_pfn_mask];
+}
 
-    // We don't clear hl2_table's here.  At least not yet.
-    if ( x->pfn & PSH_hl2 )
-        return 0;
+static void alloc_monitor_pagetable(struct exec_domain *ed)
+{
+    unsigned long mmfn;
+    l2_pgentry_t *mpl2e;
+    struct pfn_info *mmfn_info;
+    struct domain *d = ed->domain;
 
-    switch ( spage->u.inuse.type_info & PGT_type_mask )
-    {
-        /* We clear L2 pages by zeroing the guest entries. */
-    case PGT_l2_page_table:
-        p = map_domain_mem((spage - frame_table) << PAGE_SHIFT);
-        if ( shadow_mode_external(d) )
-            memset(p, 0, L2_PAGETABLE_ENTRIES * sizeof(*p));
-        else 
-            memset(p, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*p));
-        unmap_domain_mem(p);
-        break;
+    ASSERT(!pagetable_val(ed->arch.monitor_table)); /* we should only get called once */
 
-        /* We clear L1 pages by freeing them: no benefit from zeroing them. */
-    case PGT_l1_page_table:
-        delete_shadow_status(d, x->pfn);
-        free_shadow_page(d, spage);
-        restart = 1; /* We need to go to start of list again. */
-        break;
-    }
+    mmfn_info = alloc_domheap_page(NULL);
+    ASSERT( mmfn_info ); 
+
+    mmfn = (unsigned long) (mmfn_info - frame_table);
+    mpl2e = (l2_pgentry_t *) map_domain_mem(mmfn << PAGE_SHIFT);
+    memset(mpl2e, 0, PAGE_SIZE);
+
+    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
+           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+
+    mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
+        mk_l2_pgentry((__pa(d->arch.mm_perdomain_pt) & PAGE_MASK) 
+                      | __PAGE_HYPERVISOR);
 
-    return restart;
+    // map the phys_to_machine map into the Read-Only MPT space for this domain
+    mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
+        mk_l2_pgentry(pagetable_val(ed->arch.phys_table) | __PAGE_HYPERVISOR);
+
+    ed->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT);
+    ed->arch.monitor_vtable = mpl2e;
 }
 
-static void clear_shadow_state(struct domain *d)
+/*
+ * Free the pages for monitor_table and hl2_table
+ */
+void free_monitor_pagetable(struct exec_domain *ed)
 {
-    int                   i;
-    struct shadow_status *x;
-    shadow_audit(d, 1);
+    l2_pgentry_t *mpl2e, hl2e;
+    unsigned long mfn;
 
-    for ( i = 0; i < shadow_ht_buckets; i++ )
-    {
-    retry:
-        /* Skip empty buckets. */
-        x = &d->arch.shadow_ht[i];
-        if ( x->pfn == 0 )
-            continue;
+    ASSERT( pagetable_val(ed->arch.monitor_table) );
+    ASSERT( shadow_mode_external(ed->domain) );
+    
+    mpl2e = ed->arch.monitor_vtable;
 
-        if ( clear_shadow_page(d, x) )
-            goto retry;
+    /*
+     * First get the mfn for hl2_table by looking at monitor_table
+     */
+    hl2e = mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT];
+    ASSERT(l2_pgentry_val(hl2e) & _PAGE_PRESENT);
+    mfn = l2_pgentry_val(hl2e) >> PAGE_SHIFT;
+    ASSERT(mfn);
 
-        for ( x = x->next; x != NULL; x = x->next )
-            if ( clear_shadow_page(d, x) )
-                goto retry;
+    put_shadow_ref(mfn);
+    unmap_domain_mem(mpl2e);
 
-        shadow_audit(d, 0);
-    }
+    /*
+     * Then free monitor_table.
+     */
+    mfn = (pagetable_val(ed->arch.monitor_table)) >> PAGE_SHIFT;
+    free_domheap_page(&frame_table[mfn]);
 
-    SH_VLOG("Scan shadow table. l1=%d l2=%d",
-            perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
+    ed->arch.monitor_table = mk_pagetable(0);
+    ed->arch.monitor_vtable = 0;
 }
 
-
-void shadow_mode_init(void)
+int __shadow_mode_enable(struct domain *d, unsigned int mode)
 {
-}
+    struct exec_domain *ed;
 
+    for_each_exec_domain(d, ed)
+    {
+        invalidate_shadow_ldt(ed);
 
-int __shadow_mode_enable(struct domain *d, unsigned int mode)
-{
-    d->arch.shadow_mode = mode;
+        // We need to set these up for __update_pagetables().
+        // See the comment there.
+
+        /*
+         * arch.guest_vtable
+         */
+        if ( ed->arch.guest_vtable &&
+             (ed->arch.guest_vtable != __linear_l2_table) )
+        {
+            unmap_domain_mem(ed->arch.guest_vtable);
+        }
+        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
+            ed->arch.guest_vtable = __linear_l2_table;
+        else
+            ed->arch.guest_vtable = NULL;
+
+        /*
+         * arch.shadow_vtable
+         */
+        if ( ed->arch.shadow_vtable &&
+             (ed->arch.shadow_vtable != __shadow_linear_l2_table) )
+        {
+            unmap_domain_mem(ed->arch.shadow_vtable);
+        }
+        if ( !(mode & SHM_external) )
+            ed->arch.shadow_vtable = __shadow_linear_l2_table;
+        else
+            ed->arch.shadow_vtable = NULL;
+
+        /*
+         * arch.hl2_vtable
+         */
+        if ( ed->arch.hl2_vtable &&
+             (ed->arch.hl2_vtable != __linear_hl2_table) )
+        {
+            unmap_domain_mem(ed->arch.hl2_vtable);
+        }
+        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
+            ed->arch.hl2_vtable = __linear_hl2_table;
+        else
+            ed->arch.hl2_vtable = NULL;
+
+        /*
+         * arch.monitor_table & arch.monitor_vtable
+         */
+        if ( ed->arch.monitor_vtable )
+        {
+            free_monitor_pagetable(ed);
+        }
+        if ( mode & SHM_external )
+        {
+            alloc_monitor_pagetable(ed);
+        }
+    }
 
-    if (!d->arch.shadow_ht)
+    if ( !d->arch.shadow_ht )
     {
         d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
         if ( d->arch.shadow_ht == NULL )
@@ -186,7 +652,7 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode)
            shadow_ht_buckets * sizeof(struct shadow_status));
     }
 
-    if ( shadow_mode_log_dirty(d) && !d->arch.shadow_dirty_bitmap)
+    if ( shadow_mode_log_dirty(d) && !d->arch.shadow_dirty_bitmap )
     {
         d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63;
         d->arch.shadow_dirty_bitmap = 
@@ -201,6 +667,63 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode)
                d->arch.shadow_dirty_bitmap_size/8);
     }
 
+    printk("audit1\n");
+    _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK, __FILE__, __LINE__);
+    printk("audit1 done\n");
+
+    // Get rid of any shadow pages from any previous shadow mode.
+    //
+    free_shadow_pages(d);
+
+    printk("audit2\n");
+    _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK, __FILE__, __LINE__);
+    printk("audit2 done\n");
+
+    // Turn off writable page tables.
+    // It doesn't mix with shadow mode.
+    //
+    vm_assist(d, VMASST_CMD_disable, VMASST_TYPE_writable_pagetables);
+
+    /*
+     * Tear down its counts by disassembling its page-table-based ref counts.
+     * Also remove CR3's gcount/tcount.
+     * That leaves things like GDTs and LDTs and external refs intact.
+     *
+     * Most pages will be writable tcount=0.
+     * Some will still be L1 tcount=0 or L2 tcount=0.
+     * Maybe some pages will be type none tcount=0.
+     * Pages granted external writable refs (via grant tables?) will
+     * still have a non-zero tcount.  That's OK.
+     *
+     * gcounts will generally be 1 for PGC_allocated.
+     * GDTs and LDTs will have additional gcounts.
+     * Any grant-table based refs will still be in the gcount.
+     *
+     * We attempt to grab writable refs to each page (thus setting its type).
+     * Immediately put back those type refs.
+     *
+     * Assert that no pages are left with L1/L2/L3/L4 type.
+     */
+    audit_adjust_pgtables(d, -1, 1);
+    d->arch.shadow_mode = mode;
+
+    struct list_head *list_ent = d->page_list.next;
+    while ( list_ent != &d->page_list )
+    {
+        struct pfn_info *page = list_entry(list_ent, struct pfn_info, list);
+        if ( !get_page_type(page, PGT_writable_page) )
+            BUG();
+        put_page_type(page);
+
+        list_ent = page->list.next;
+    }
+
+    audit_adjust_pgtables(d, 1, 1);
+
+    printk("audit3\n");
+    _audit_domain(d, AUDIT_ALREADY_LOCKED, __FILE__, __LINE__);
+    printk("audit3 done\n");
+
     return 0;
 
  nomem:
@@ -219,13 +742,10 @@ int shadow_mode_enable(struct domain *d, unsigned int mode)
     return rc;
 }
 
-void __shadow_mode_disable(struct domain *d)
+static void free_shadow_ht_entries(struct domain *d)
 {
     struct shadow_status *x, *n;
 
-    free_shadow_state(d);
-    d->arch.shadow_mode = 0;
-
     SH_VLOG("freed tables count=%d l1=%d l2=%d",
             d->arch.shadow_page_count, perfc_value(shadow_l1_pages), 
             perfc_value(shadow_l2_pages));
@@ -239,6 +759,8 @@ void __shadow_mode_disable(struct domain *d)
     }
 
     d->arch.shadow_ht_extras = NULL;
+    d->arch.shadow_ht_free = NULL;
+
     ASSERT(d->arch.shadow_extras_count == 0);
     SH_LOG("freed extras, now %d", d->arch.shadow_extras_count);
 
@@ -253,6 +775,45 @@ void __shadow_mode_disable(struct domain *d)
     d->arch.shadow_ht = NULL;
 }
 
+static void free_out_of_sync_entries(struct domain *d)
+{
+    struct out_of_sync_entry *x, *n;
+
+    n = d->arch.out_of_sync_extras;
+    while ( (x = n) != NULL )
+    {
+        d->arch.out_of_sync_extras_count--;
+        n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
+        xfree(x);
+    }
+
+    d->arch.out_of_sync_extras = NULL;
+    d->arch.out_of_sync_free = NULL;
+    d->arch.out_of_sync = NULL;
+
+    ASSERT(d->arch.out_of_sync_extras_count == 0);
+    FSH_LOG("freed extra out_of_sync entries, now %d",
+            d->arch.out_of_sync_extras_count);
+}
+
+void __shadow_mode_disable(struct domain *d)
+{
+    // This needs rethinking for the full shadow mode stuff.
+    //
+    // Among other things, ref counts need to be restored to a sensible
+    // state for a non-shadow-mode guest...
+    // This is probably easiest to do by stealing code from audit_domain().
+    //
+    BUG();
+
+    free_shadow_pages(d);
+    
+    d->arch.shadow_mode = 0;
+
+    free_shadow_ht_entries(d);
+    free_out_of_sync_entries(d);
+}
+
 static int shadow_mode_table_op(
     struct domain *d, dom0_shadow_control_t *sc)
 {
@@ -272,7 +833,7 @@ static int shadow_mode_table_op(
     switch ( op )
     {
     case DOM0_SHADOW_CONTROL_OP_FLUSH:
-        free_shadow_state(d);
+        free_shadow_pages(d);
 
         d->arch.shadow_fault_count       = 0;
         d->arch.shadow_dirty_count       = 0;
@@ -282,7 +843,7 @@ static int shadow_mode_table_op(
         break;
    
     case DOM0_SHADOW_CONTROL_OP_CLEAN:
-        clear_shadow_state(d);
+        free_shadow_pages(d);
 
         sc->stats.fault_count       = d->arch.shadow_fault_count;
         sc->stats.dirty_count       = d->arch.shadow_dirty_count;
@@ -394,13 +955,13 @@ int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
         break;
 
     case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
-        free_shadow_state(d);
+        free_shadow_pages(d);
         rc = __shadow_mode_enable(d, SHM_enable);
         break;
 
     case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
-        free_shadow_state(d);
-        rc = __shadow_mode_enable(d, d->arch.shadow_mode|SHM_log_dirty);
+        free_shadow_pages(d);
+        rc = __shadow_mode_enable(d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
         break;
 
     default:
@@ -418,249 +979,828 @@ int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
     return rc;
 }
 
-static inline struct pfn_info *alloc_shadow_page(struct domain *d)
+/*
+ * XXX KAF: Why is this VMX specific?
+ */
+void vmx_shadow_clear_state(struct domain *d)
+{
+    SH_VVLOG("vmx_clear_shadow_state:");
+    shadow_lock(d);
+    free_shadow_pages(d);
+    shadow_unlock(d);
+}
+
+static unsigned long
+shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
+                unsigned long smfn)
 {
-    struct pfn_info *page = alloc_domheap_page(NULL);
+    unsigned long hl2mfn;
+    l1_pgentry_t *hl2;
+    l2_pgentry_t *gl2;
+    int i, limit;
 
-    d->arch.shadow_page_count++;
+    ASSERT(PGT_base_page_table == PGT_l2_page_table);
 
-    if ( unlikely(page == NULL) )
+    if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
+    {
+        printk("Couldn't alloc an HL2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn);
+        BUG(); /* XXX Deal gracefully with failure. */
+    }
+
+    perfc_incrc(shadow_hl2_table_count);
+
+    ASSERT( pagetable_val(current->arch.guest_table) == (gmfn << PAGE_SHIFT) );
+    gl2 = current->arch.guest_vtable;
+
+    hl2 = map_domain_mem(hl2mfn << PAGE_SHIFT);
+
+    if ( shadow_mode_external(d) )
+        limit = L2_PAGETABLE_ENTRIES;
+    else
+        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
+
+    for ( i = 0; i < limit; i++ )
+    {
+        unsigned long gl2e = l2_pgentry_val(gl2[i]);
+        unsigned long mfn;
+
+        if ( gl2e & _PAGE_PRESENT )
+        {
+            mfn = __gpfn_to_mfn(d, gl2e >> PAGE_SHIFT);
+            hl2[i] = mk_l1_pgentry((mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+            get_page(pfn_to_page(mfn), d);
+        }
+        else
+            hl2[i] = mk_l1_pgentry(0);
+    }
+
+    if ( !shadow_mode_external(d) )
+    {
+        memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
+               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+
+        // Setup easy access to the GL2, SL2, and HL2 frames.
+        //
+        hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
+            mk_l1_pgentry((gmfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+        hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+            mk_l1_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+        hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
+            mk_l1_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+    }
+
+    unmap_domain_mem(hl2);
+
+    return hl2mfn;
+}
+
+/*
+ * This could take and use a snapshot, and validate the entire page at
+ * once, or it could continue to fault in entries one at a time...
+ * Might be worth investigating...
+ */
+static unsigned long shadow_l2_table(
+    struct domain *d, unsigned long gpfn, unsigned long gmfn)
+{
+    unsigned long smfn;
+    l2_pgentry_t *spl2e;
+
+    SH_VVLOG("shadow_l2_table(gpfn=%p, gmfn=%p)", gpfn, gmfn);
+
+    perfc_incrc(shadow_l2_table_count);
+
+    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
+    {
+        printk("Couldn't alloc an L2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn);
+        BUG(); /* XXX Deal gracefully with failure. */
+    }
+
+    spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
+
+    /* Install hypervisor and 2x linear p.t. mappings. */
+    if ( (PGT_base_page_table == PGT_l2_page_table) &&
+         !shadow_mode_external(d) )
+    {
+        /*
+         * We could proactively fill in PDEs for pages that are already
+         * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
+         * (restriction required for coherence of the accessed bit). However,
+         * we tried it and it didn't help performance. This is simpler. 
+         */
+        memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
+
+        /* Install hypervisor and 2x linear p.t. mappings. */
+        memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
+               &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+
+        if ( shadow_mode_translate(d) ) // NB: not external
+        {
+            unsigned long hl2mfn;
+            if ( unlikely(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow)) )
+                hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
+
+            // shadow_mode_translate (but not external) sl2 tables hold a
+            // ref to their hl2.
+            //
+            get_shadow_ref(hl2mfn);
+            
+            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
+                mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+        }
+        else
+            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
+                mk_l2_pgentry((gmfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+
+        spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+            mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+
+        spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
+            mk_l2_pgentry(__pa(page_get_owner(
+                &frame_table[gmfn])->arch.mm_perdomain_pt) |
+                          __PAGE_HYPERVISOR);
+    }
+    else
+    {
+        memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));        
+    }
+
+    unmap_domain_mem(spl2e);
+
+    SH_VLOG("shadow_l2_table(%p -> %p)", gmfn, smfn);
+    return smfn;
+}
+
+void shadow_map_l1_into_current_l2(unsigned long va)
+{ 
+    struct exec_domain *ed = current;
+    struct domain *d = ed->domain;
+    unsigned long    *gpl1e, *spl1e, gl2e, sl2e, gl1pfn, gl1mfn, sl1mfn;
+    int i, init_table = 0;
+
+    __guest_get_l2e(ed, va, &gl2e);
+    ASSERT(gl2e & _PAGE_PRESENT);
+    gl1pfn = gl2e >> PAGE_SHIFT;
+
+    if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
     {
-        printk("Couldn't alloc shadow page! count=%d\n",
-               d->arch.shadow_page_count);
-        SH_VLOG("Shadow tables l1=%d l2=%d",
-                perfc_value(shadow_l1_pages), 
-                perfc_value(shadow_l2_pages));
+        /* This L1 is NOT already shadowed so we need to shadow it. */
+        SH_VVLOG("4a: l1 not shadowed");
+
+        gl1mfn = __gpfn_to_mfn(d, gl1pfn);
+        if ( unlikely(!gl1mfn) )
+        {
+            // Attempt to use an invalid pfn as an L1 page.
+            // XXX this needs to be more graceful!
+            BUG();
+        }
+
+        if ( unlikely(!(sl1mfn =
+                        alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
+        {
+            printk("Couldn't alloc an L1 shadow for pfn=%p mfn=%p\n",
+                   gl1pfn, gl1mfn);
+            BUG(); /* XXX Need to deal gracefully with failure. */
+        }
+
+        perfc_incrc(shadow_l1_table_count);
+        init_table = 1;
+    }
+    else
+    {
+        /* This L1 is shadowed already, but the L2 entry is missing. */
+        SH_VVLOG("4b: was shadowed, l2 missing (%p)", sl1mfn);
+    }
+
+#ifndef NDEBUG
+    unsigned long old_sl2e;
+    __shadow_get_l2e(ed, va, &old_sl2e);
+    ASSERT( !(old_sl2e & _PAGE_PRESENT) );
+#endif
+
+    get_shadow_ref(sl1mfn);
+    l2pde_general(d, &gl2e, &sl2e, sl1mfn);
+    __guest_set_l2e(ed, va, gl2e);
+    __shadow_set_l2e(ed, va, sl2e);
+
+    if ( init_table )
+    {
+        gpl1e = (unsigned long *)
+            &(linear_pg_table[l1_linear_offset(va) &
+                              ~(L1_PAGETABLE_ENTRIES-1)]);
+
+        spl1e = (unsigned long *)
+            &(shadow_linear_pg_table[l1_linear_offset(va) &
+                                     ~(L1_PAGETABLE_ENTRIES-1)]);
+
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        {
+            l1pte_propagate_from_guest(d, gpl1e[i], &spl1e[i]);
+            if ( spl1e[i] & _PAGE_PRESENT )
+                get_page_from_l1e(mk_l1_pgentry(spl1e[i]), d);
+        }
+    }
+}
+
+void shadow_invlpg(struct exec_domain *ed, unsigned long va)
+{
+    struct domain *d = ed->domain;
+    unsigned long gpte, spte;
+
+    ASSERT(shadow_mode_enabled(d));
+
+    shadow_lock(d);
+
+    __shadow_sync_va(ed, va);
+
+    // XXX mafetter: will need to think about 4MB pages...
+
+    // It's not strictly necessary to update the shadow here,
+    // but it might save a fault later.
+    //
+    if (__get_user(gpte, (unsigned long *)
+                   &linear_pg_table[va >> PAGE_SHIFT])) {
+        perfc_incrc(shadow_invlpg_faults);
+        return;
+    }
+    l1pte_propagate_from_guest(d, gpte, &spte);
+    shadow_set_l1e(va, spte, 1);
+
+    shadow_unlock(d);
+}
+
+struct out_of_sync_entry *
+shadow_alloc_oos_entry(struct domain *d)
+{
+    struct out_of_sync_entry *f, *extra;
+    unsigned size, i;
+
+    if ( unlikely(d->arch.out_of_sync_free == NULL) )
+    {
+        FSH_LOG("Allocate more fullshadow tuple blocks.");
+
+        size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
+        extra = xmalloc_bytes(size);
+
+        /* XXX Should be more graceful here. */
+        if ( extra == NULL )
+            BUG();
+
+        memset(extra, 0, size);
+
+        /* Record the allocation block so it can be correctly freed later. */
+        d->arch.out_of_sync_extras_count++;
+        *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) = 
+            d->arch.out_of_sync_extras;
+        d->arch.out_of_sync_extras = &extra[0];
+
+        /* Thread a free chain through the newly-allocated nodes. */
+        for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
+            extra[i].next = &extra[i+1];
+        extra[i].next = NULL;
+
+        /* Add the new nodes to the free list. */
+        d->arch.out_of_sync_free = &extra[0];
+    }
+
+    /* Allocate a new node from the quicklist. */
+    f = d->arch.out_of_sync_free;
+    d->arch.out_of_sync_free = f->next;
+
+    return f;
+}
+
+static unsigned long
+shadow_make_snapshot(
+    struct domain *d, unsigned long gpfn, unsigned long gmfn)
+{
+    unsigned long smfn;
+    void *original, *snapshot;
+
+    if ( test_and_set_bit(_PGC_out_of_sync, &frame_table[gmfn].count_info) )
+    {
+        ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
+        return SHADOW_SNAPSHOT_ELSEWHERE;
+    }
+
+    perfc_incrc(shadow_make_snapshot);
+
+    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
+    {
+        printk("Couldn't alloc fullshadow snapshot for pfn=%p mfn=%p!\n"
+               "Dom%d snapshot_count_count=%d\n",
+               gpfn, gmfn, d->id, d->arch.snapshot_page_count);
         BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
     }
 
-    return page;
+    get_shadow_ref(smfn);
+
+    original = map_domain_mem(gmfn << PAGE_SHIFT);
+    snapshot = map_domain_mem(smfn << PAGE_SHIFT);
+    memcpy(snapshot, original, PAGE_SIZE);
+    unmap_domain_mem(original);
+    unmap_domain_mem(snapshot);
+
+    return smfn;
+}
+
+static void
+shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
+{
+    void *snapshot;
+
+    if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
+        return;
+
+    // Clear the out_of_sync bit.
+    //
+    clear_bit(_PGC_out_of_sync, &frame_table[entry->gmfn].count_info);
+
+    // XXX Need to think about how to protect the domain's
+    // information less expensively.
+    //
+    snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT);
+    memset(snapshot, 0, PAGE_SIZE);
+    unmap_domain_mem(snapshot);
+
+    put_shadow_ref(entry->snapshot_mfn);
+}
+
+struct out_of_sync_entry *
+shadow_mark_mfn_out_of_sync(struct exec_domain *ed, unsigned long gpfn,
+                             unsigned long mfn)
+{
+    struct domain *d = ed->domain;
+    struct pfn_info *page = &frame_table[mfn];
+    struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
+
+    ASSERT(spin_is_locked(&d->arch.shadow_lock));
+    ASSERT(pfn_is_ram(mfn));
+    //ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page);
+    if (!((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page))
+    {
+        printk("assertion failed: gpfn=%p gmfn=%p t=%p\n",
+               gpfn, mfn, page->u.inuse.type_info);
+        BUG();
+    }
+
+    FSH_LOG("mark_mfn_out_of_sync(gpfn=%p, mfn=%p) c=%p t=%p",
+            gpfn, mfn, page->count_info, page->u.inuse.type_info);
+
+    // XXX this will require some more thought...  Cross-domain sharing and
+    //     modification of page tables?  Hmm...
+    //
+    if ( d != page_get_owner(page) )
+        BUG();
+
+    perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
+
+    entry->gpfn = gpfn;
+    entry->gmfn = mfn;
+    entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
+    entry->writable_pl1e = -1;
+
+    // increment guest's ref count to represent the entry in the
+    // full shadow out-of-sync list.
+    //
+    get_page(page, d);
+
+    // Add to the out-of-sync list
+    //
+    entry->next = d->arch.out_of_sync;
+    d->arch.out_of_sync = entry;
+
+    return entry;
+}
+
+void shadow_mark_out_of_sync(
+    struct exec_domain *ed, unsigned long gpfn, unsigned long mfn, unsigned long va)
+{
+    struct out_of_sync_entry *entry =
+        shadow_mark_mfn_out_of_sync(ed, gpfn, mfn);
+    unsigned long sl2e;
+
+    // We need the address of the shadow PTE that maps @va.
+    // It might not exist yet.  Make sure it's there.
+    //
+    __shadow_get_l2e(ed, va, &sl2e);
+    if ( !(sl2e & _PAGE_PRESENT) )
+    {
+        // either this L1 isn't shadowed yet, or the shadow isn't linked into
+        // the current L2.
+        shadow_map_l1_into_current_l2(va);
+        __shadow_get_l2e(ed, va, &sl2e);
+    }
+    ASSERT(sl2e & _PAGE_PRESENT);
+
+    // NB: this is stored as a machine address.
+    entry->writable_pl1e =
+        ((sl2e & PAGE_MASK) |
+         (sizeof(l1_pgentry_t) * l1_table_offset(va)));
+    ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
+
+    // Increment shadow's page count to represent the reference
+    // inherent in entry->writable_pl1e
+    //
+    get_shadow_ref(sl2e >> PAGE_SHIFT);
+
+    FSH_LOG("mark_out_of_sync(va=%p -> writable_pl1e=%p)",
+            va, entry->writable_pl1e);
+}
+
+/*
+ * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches.
+ * Returns 0 otherwise.
+ */
+static int snapshot_entry_matches(
+    struct exec_domain *ed, unsigned long gmfn, unsigned index)
+{
+    unsigned long gpfn = __mfn_to_gpfn(ed->domain, gmfn);
+    unsigned long smfn = __shadow_status(ed->domain, gpfn, PGT_snapshot);
+    unsigned long *guest, *snapshot;
+    int compare;
+
+    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
+
+    perfc_incrc(snapshot_entry_matches_calls);
+
+    if ( !smfn )
+        return 0;
+
+    guest    = map_domain_mem(gmfn << PAGE_SHIFT);
+    snapshot = map_domain_mem(smfn << PAGE_SHIFT);
+
+    // This could probably be smarter, but this is sufficient for
+    // our current needs.
+    //
+    compare = (guest[index] == snapshot[index]);
+
+    unmap_domain_mem(guest);
+    unmap_domain_mem(snapshot);
+
+#ifdef PERF_COUNTERS
+    if ( compare )
+        perfc_incrc(snapshot_entry_matches_true);
+#endif
+
+    return compare;
+}
+
+/*
+ * Returns 1 if va's shadow mapping is out-of-sync.
+ * Returns 0 otherwise.
+ */
+int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va)
+{
+    struct domain *d = ed->domain;
+    unsigned long l2mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
+    unsigned long l2e;
+    unsigned long l1mfn;
+
+    ASSERT(spin_is_locked(&d->arch.shadow_lock));
+
+    perfc_incrc(shadow_out_of_sync_calls);
+
+    if ( page_out_of_sync(&frame_table[l2mfn]) &&
+         !snapshot_entry_matches(ed, l2mfn, l2_table_offset(va)) )
+        return 1;
+
+    __guest_get_l2e(ed, va, &l2e);
+    if ( !(l2e & _PAGE_PRESENT) )
+        return 0;
+
+    l1mfn = __gpfn_to_mfn(d, l2e >> PAGE_SHIFT);
+
+    // If the l1 pfn is invalid, it can't be out of sync...
+    if ( !l1mfn )
+        return 0;
+
+    if ( page_out_of_sync(&frame_table[l1mfn]) &&
+         !snapshot_entry_matches(ed, l1mfn, l1_table_offset(va)) )
+        return 1;
+
+    return 0;
+}
+
+static u32 remove_all_write_access_in_ptpage(
+    struct domain *d, unsigned long pt_mfn, unsigned long readonly_mfn)
+{
+    unsigned long *pt = map_domain_mem(pt_mfn << PAGE_SHIFT);
+    unsigned long match =
+        (readonly_mfn << PAGE_SHIFT) | _PAGE_RW | _PAGE_PRESENT;
+    unsigned long mask = PAGE_MASK | _PAGE_RW | _PAGE_PRESENT;
+    int i;
+    u32 count = 0;
+    int is_l1_shadow =
+        ((frame_table[pt_mfn].u.inuse.type_info & PGT_type_mask) ==
+         PGT_l1_shadow);
+
+    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
+    {
+        if ( unlikely(((pt[i] ^ match) & mask) == 0) )
+        {
+            unsigned long old = pt[i];
+            unsigned long new = old & ~_PAGE_RW;
+
+            if ( is_l1_shadow )
+                get_page_from_l1e(mk_l1_pgentry(new), d);
+
+            count++;
+            pt[i] = new;
+
+            if ( is_l1_shadow )
+                put_page_from_l1e(mk_l1_pgentry(old), d);
+
+            FSH_LOG("removed write access to mfn=%p in smfn=%p entry %x "
+                    "is_l1_shadow=%d\n",
+                    readonly_mfn, pt_mfn, i, is_l1_shadow);
+        }
+    }
+
+    unmap_domain_mem(pt);
+
+    return count;
 }
 
-void unshadow_table(unsigned long gpfn, unsigned int type)
+u32 shadow_remove_all_write_access(
+    struct domain *d, unsigned min_type, unsigned max_type, unsigned long gpfn)
 {
-    unsigned long  smfn;
-    struct domain *d = page_get_owner(&frame_table[gpfn]);
-
-    SH_VLOG("unshadow_table type=%08x gpfn=%p", type, gpfn);
+    int i;
+    struct shadow_status *a;
+    unsigned long gmfn = __gpfn_to_mfn(d, gpfn);
+    unsigned long sl1mfn = __shadow_status(d, gpfn, PGT_l1_shadow);
+    u32 count = 0;
 
-    perfc_incrc(unshadow_table_count);
+    ASSERT(spin_is_locked(&d->arch.shadow_lock));
+    ASSERT(gmfn);
 
-    /*
-     * This function is the same for all p.t. pages. Even for multi-processor 
-     * guests there won't be a race here as this CPU was the one that 
-     * cmpxchg'ed the page to invalid.
-     */
-    smfn = __shadow_status(d, gpfn) & PSH_pfn_mask;
-    delete_shadow_status(d, gpfn);
-    free_shadow_page(d, &frame_table[smfn]);
-}
+    for (i = 0; i < shadow_ht_buckets; i++)
+    {
+        a = &d->arch.shadow_ht[i];
+        while ( a && a->gpfn_and_flags )
+        {
+            if ( ((a->gpfn_and_flags & PGT_type_mask) >= min_type) &&
+                 ((a->gpfn_and_flags & PGT_type_mask) <= max_type) )
+            {
+                switch ( a->gpfn_and_flags & PGT_type_mask )
+                {
+                case PGT_l1_shadow:
+                    count +=
+                        remove_all_write_access_in_ptpage(d, a->smfn, gmfn);
+                    break;
+                case PGT_l2_shadow:
+                    if ( sl1mfn )
+                        count +=
+                            remove_all_write_access_in_ptpage(d, a->smfn,
+                                                              sl1mfn);
+                    break;
+                case PGT_hl2_shadow:
+                    // nothing to do here...
+                    break;
+                default:
+                    // need to flush this out for 4 level page tables.
+                    BUG();
+                }
+            }
+            a = a->next;
+        }
+    }
 
-/*
- * XXX KAF:
- *  1. Why is this VMX specific?
- *  2. Why is VMX using clear_state() rather than free_state()?
- *     (could we get rid of clear_state and fold into free_state?)
- */
-void vmx_shadow_clear_state(struct domain *d)
-{
-    SH_VVLOG("vmx_clear_shadow_state:");
-    shadow_lock(d);
-    clear_shadow_state(d);
-    shadow_unlock(d);
+    return count;
 }
 
-unsigned long shadow_l2_table( 
-    struct domain *d, unsigned long gmfn)
+static u32 remove_all_access_in_page(
+    struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
 {
-    struct pfn_info *spfn_info;
-    unsigned long    spfn;
-    unsigned long    gpfn;
+    unsigned long *pl1e = map_domain_mem(l1mfn << PAGE_SHIFT);
+    unsigned long match = (forbidden_gmfn << PAGE_SHIFT) | _PAGE_PRESENT;
+    unsigned long mask  = PAGE_MASK | _PAGE_PRESENT;
+    int i;
+    u32 count = 0;
+    int is_l1_shadow =
+        ((frame_table[l1mfn].u.inuse.type_info & PGT_type_mask) ==
+         PGT_l1_shadow);
 
-    gpfn = __mfn_to_gpfn(d, gmfn);
+    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
+    {
+        if ( unlikely(((pl1e[i] ^ match) & mask) == 0) )
+        {
+            unsigned long ol2e = pl1e[i];
+            pl1e[i] = 0;
+            count++;
+
+            if ( is_l1_shadow )
+                put_page_from_l1e(mk_l1_pgentry(ol2e), d);
+            else /* must be an hl2 page */
+                put_page(&frame_table[forbidden_gmfn]);
+        }
+    }
 
-    SH_VVLOG("shadow_l2_table( %p )", gmfn);
+    unmap_domain_mem(pl1e);
 
-    perfc_incrc(shadow_l2_table_count);
+    return count;
+}
 
-    if ( (spfn_info = alloc_shadow_page(d)) == NULL )
-        BUG(); /* XXX Deal gracefully with failure. */
+u32 shadow_remove_all_access(struct domain *d, unsigned long gmfn)
+{
+    int i;
+    struct shadow_status *a;
+    u32 count = 0;
 
-    spfn_info->u.inuse.type_info = PGT_l2_page_table;
-    perfc_incr(shadow_l2_pages);
+    ASSERT(spin_is_locked(&d->arch.shadow_lock));
 
-    spfn = page_to_pfn(spfn_info);
-  /* Mark pfn as being shadowed; update field to point at shadow. */
-    set_shadow_status(d, gpfn, spfn | PSH_shadowed);
-#ifdef __i386__
-    /* Install hypervisor and 2x linear p.t. mapings. */
-    if ( !shadow_mode_translate(d) )
+    for (i = 0; i < shadow_ht_buckets; i++)
     {
-        l2_pgentry_t *spl2e;
-        spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT);
-        /*
-         * We could proactively fill in PDEs for pages that are already
-         * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
-         * (restriction required for coherence of the accessed bit). However,
-         * we tried it and it didn't help performance. This is simpler. 
-         */
-        memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
-
-        /* Install hypervisor and 2x linear p.t. mapings. */
-        memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
-               &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
-               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-        spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
-            mk_l2_pgentry((gmfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
-        spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
-            mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
-        spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
-            mk_l2_pgentry(__pa(page_get_owner(
-                &frame_table[gmfn])->arch.mm_perdomain_pt) |
-                          __PAGE_HYPERVISOR);
-
-        unmap_domain_mem(spl2e);
+        a = &d->arch.shadow_ht[i];
+        while ( a && a->gpfn_and_flags )
+        {
+            if ( ((a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow) ||
+                 ((a->gpfn_and_flags & PGT_type_mask) == PGT_hl2_shadow) )
+            {
+                count += remove_all_access_in_page(d, a->smfn, gmfn);
+            }
+            a = a->next;
+        }
     }
-#endif
 
-    SH_VLOG("shadow_l2_table( %p -> %p)", gmfn, spfn);
-    return spfn;
-}
-
-static void shadow_map_l1_into_current_l2(unsigned long va)
-{ 
-    struct exec_domain *ed = current;
-    struct domain *d = ed->domain;
-    unsigned long    *gpl1e, *spl1e, gl2e, sl2e, gl1pfn, sl1mfn, sl1ss;
-    struct pfn_info  *sl1mfn_info;
-    int               i;
+    return count;
+}    
 
-    __guest_get_l2e(ed, va, &gl2e);
+static int resync_all(struct domain *d, u32 stype)
+{
+    struct out_of_sync_entry *entry;
+    unsigned i;
+    unsigned long smfn;
+    unsigned long *guest, *shadow, *snapshot;
+    int need_flush = 0, external = shadow_mode_external(d);
 
-    gl1pfn = gl2e >> PAGE_SHIFT;
+    ASSERT(spin_is_locked(&d->arch.shadow_lock));
 
-    sl1ss = __shadow_status(d, gl1pfn);
-    if ( !(sl1ss & PSH_shadowed) )
+    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
     {
-        /* This L1 is NOT already shadowed so we need to shadow it. */
-        SH_VVLOG("4a: l1 not shadowed ( %p )", sl1ss);
+        if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
+            continue;
 
-        sl1mfn_info = alloc_shadow_page(d);
-        sl1mfn_info->u.inuse.type_info = PGT_l1_page_table;
-   
-        sl1mfn = sl1mfn_info - frame_table;
+        if ( !(smfn = __shadow_status(d, entry->gpfn, stype)) )
+            continue;
 
-        perfc_incrc(shadow_l1_table_count);
-        perfc_incr(shadow_l1_pages);
+        FSH_LOG("resyncing t=%p gpfn=%p gmfn=%p smfn=%p snapshot_mfn=%p",
+                stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
 
-        set_shadow_status(d, gl1pfn, PSH_shadowed | sl1mfn);
+        // Compare guest's new contents to its snapshot, validating
+        // and updating its shadow as appropriate.
+        //
+        guest    = map_domain_mem(entry->gmfn         << PAGE_SHIFT);
+        snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT);
+        shadow   = map_domain_mem(smfn                << PAGE_SHIFT);
 
-        l2pde_general(d, &gl2e, &sl2e, sl1mfn);
+        switch ( stype ) {
+        case PGT_l1_shadow:
+            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+            {
+                unsigned new_pte = guest[i];
+                if ( new_pte != snapshot[i] )
+                {
+                    need_flush |= validate_pte_change(d, new_pte, &shadow[i]);
 
-        __guest_set_l2e(ed, va, gl2e);
-        __shadow_set_l2e(ed, va, sl2e);
+                    // can't update snapshots of linear page tables -- they
+                    // are used multiple times...
+                    //
+                    // snapshot[i] = new_pte;
+                }
+            }
+            break;
+        case PGT_l2_shadow:
+            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+            {
+                if ( !is_guest_l2_slot(i) && !external )
+                    continue;
 
-        gpl1e = (unsigned long *) &(linear_pg_table[
-            (va>>L1_PAGETABLE_SHIFT) & ~(L1_PAGETABLE_ENTRIES-1)]);
+                unsigned new_pde = guest[i];
+                if ( new_pde != snapshot[i] )
+                {
+                    need_flush |= validate_pde_change(d, new_pde, &shadow[i]);
 
-        spl1e = (unsigned long *) &(shadow_linear_pg_table[
-            (va>>L1_PAGETABLE_SHIFT) & ~(L1_PAGETABLE_ENTRIES-1)]);
+                    // can't update snapshots of linear page tables -- they
+                    // are used multiple times...
+                    //
+                    // snapshot[i] = new_pde;
+                }
+            }
+            break;
+        default:
+            BUG();
+            break;
+        }
 
-        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-            l1pte_propagate_from_guest(d, &gpl1e[i], &spl1e[i]);
+        unmap_domain_mem(shadow);
+        unmap_domain_mem(snapshot);
+        unmap_domain_mem(guest);
     }
-    else
-    {
-        /* This L1 is shadowed already, but the L2 entry is missing. */
-        SH_VVLOG("4b: was shadowed, l2 missing ( %p )", sl1ss);
 
-        sl1mfn = sl1ss & PSH_pfn_mask;
-        l2pde_general(d, &gl2e, &sl2e, sl1mfn);
-        __guest_set_l2e(ed, va, gl2e);
-        __shadow_set_l2e(ed, va, sl2e);
-    }              
+    return need_flush;
 }
 
-void shadow_invlpg(struct exec_domain *ed, unsigned long va)
+void __shadow_sync_all(struct domain *d)
 {
-    unsigned long gpte, spte;
+    struct out_of_sync_entry *entry;
+    int need_flush = 0;
 
-    ASSERT(shadow_mode_enabled(ed->domain));
+    perfc_incrc(shadow_sync_all);
 
-    /*
-     * XXX KAF: Why is this set-to-zero required?
-     *          Why, on failure, must we bin all our shadow state?
-     */
-    if (__put_user(0L, (unsigned long *)
-                   &shadow_linear_pg_table[va >> PAGE_SHIFT])) {
-        vmx_shadow_clear_state(ed->domain);
-        return;
-    }
+    ASSERT(spin_is_locked(&d->arch.shadow_lock));
 
-    if (__get_user(gpte, (unsigned long *)
-                   &linear_pg_table[va >> PAGE_SHIFT])) {
-        return;
-    }
+    // First, remove all write permissions to the page tables
+    //
+    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
+    {
+        // Skip entries that have low bits set...  Those aren't
+        // real PTEs.
+        //
+        if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
+            continue;
 
-    l1pte_propagate_from_guest(ed->domain, &gpte, &spte);
+        unsigned long *ppte = map_domain_mem(entry->writable_pl1e);
+        unsigned long opte = *ppte;
+        unsigned long npte = opte & ~_PAGE_RW;
 
-    if (__put_user(spte, (unsigned long *)
-                   &shadow_linear_pg_table[va >> PAGE_SHIFT])) {
-        return;
+        get_page_from_l1e(mk_l1_pgentry(npte), d);
+        *ppte = npte;
+        put_page_from_l1e(mk_l1_pgentry(opte), d);
+
+        unmap_domain_mem(ppte);
     }
+
+    // XXX mafetter: SMP perf bug.
+    //
+    // With the current algorithm, we've gotta flush all the TLBs
+    // before we can safely continue.  I don't think we want to
+    // do it this way, so I think we should consider making
+    // entirely private copies of the shadow for each vcpu, and/or
+    // possibly having a mix of private and shared shadow state
+    // (any path from a PTE that grants write access to an out-of-sync
+    // page table page needs to be vcpu private).
+    //
+    flush_tlb_all();
+
+    // Second, resync all L1 pages, then L2 pages, etc...
+    //
+    need_flush |= resync_all(d, PGT_l1_shadow);
+    if ( shadow_mode_translate(d) )
+        need_flush |= resync_all(d, PGT_hl2_shadow);
+    need_flush |= resync_all(d, PGT_l2_shadow);
+
+    if ( need_flush )
+        local_flush_tlb();
+
+    free_out_of_sync_state(d);
 }
 
 int shadow_fault(unsigned long va, struct xen_regs *regs)
 {
-    unsigned long gpte, spte = 0;
+    unsigned long gpte, spte = 0, orig_gpte;
     struct exec_domain *ed = current;
     struct domain *d = ed->domain;
+    unsigned long gpde;
 
     SH_VVLOG("shadow_fault( va=%p, code=%lu )", va, regs->error_code );
-
-    check_pagetable(d, ed->arch.guest_table, "pre-sf");
+    perfc_incrc(shadow_fault_calls);
+    
+    check_pagetable(ed, "pre-sf");
 
     /*
-     * STEP 1. A fast-reject set of checks with no locking.
+     * Don't let someone else take the guest's table pages out-of-sync.
      */
+    shadow_lock(d);
 
-    if ( unlikely(__get_user(gpte, (unsigned long *)
-                             &linear_pg_table[va >> PAGE_SHIFT])) )
-    {
-        SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
-        return 0;
-    }
-
-    if ( !(gpte & _PAGE_PRESENT) )
-    {
-        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
-        return 0;
-    }
-
-    if ( (regs->error_code & 2)  && !(gpte & _PAGE_RW) )
-    {
-        /* Write fault on a read-only mapping. */
-        return 0;
-    }
+    /* XXX - FIX THIS COMMENT!!!
+     * STEP 1. Check to see if this fault might have been caused by an
+     *         out-of-sync table page entry, or if we should pass this
+     *         fault onto the guest.
+     */
+    __shadow_sync_va(ed, va);
 
     /*
-     * STEP 2. Take the shadow lock and re-check the guest PTE.
+     * STEP 2. Check the guest PTE.
      */
-
-    shadow_lock(d);
-    if ( unlikely(__get_user(gpte, (unsigned long *)
-                             &linear_pg_table[va >> PAGE_SHIFT])) )
+    __guest_get_l2e(ed, va, &gpde);
+    if ( unlikely(!(gpde & _PAGE_PRESENT)) )
     {
-        SH_VVLOG("shadow_fault - EXIT: read gpte faulted2" );
+        SH_VVLOG("shadow_fault - EXIT: L1 not present" );
+        perfc_incrc(shadow_fault_bail_pde_not_present);
         shadow_unlock(d);
         return 0;
     }
 
+    // This can't fault because we hold the shadow lock and we've ensured that
+    // the mapping is in-sync, so the check of the PDE's present bit, above,
+    // covers this access.
+    //
+    orig_gpte = gpte = l1_pgentry_val(linear_pg_table[l1_linear_offset(va)]);
     if ( unlikely(!(gpte & _PAGE_PRESENT)) )
     {
-        SH_VVLOG("shadow_fault - EXIT: gpte not present2 (%lx)",gpte );
+        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
+        perfc_incrc(shadow_fault_bail_pte_not_present);
         shadow_unlock(d);
         return 0;
     }
@@ -672,11 +1812,12 @@ int shadow_fault(unsigned long va, struct xen_regs *regs)
         {
             /* Write fault on a read-only mapping. */
             SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%lx)", gpte);
+            perfc_incrc(shadow_fault_bail_ro_mapping);
             shadow_unlock(d);
             return 0;
         }
 
-        l1pte_write_fault(d, &gpte, &spte);
+        l1pte_write_fault(ed, &gpte, &spte, va);
     }
     else
     {
@@ -689,120 +1830,141 @@ int shadow_fault(unsigned long va, struct xen_regs *regs)
 
     /* XXX Watch out for read-only L2 entries! (not used in Linux). */
     if ( unlikely(__put_user(gpte, (unsigned long *)
-                             &linear_pg_table[va >> PAGE_SHIFT])) )
-        domain_crash();
-
-    /*
-     * Update of shadow PTE can fail because the L1 p.t. is not shadowed,
-     * or because the shadow isn't linked into this shadow L2 p.t.
-     */
-    if ( unlikely(__put_user(spte, (unsigned long *)
-                             &shadow_linear_pg_table[va >> PAGE_SHIFT])) )
+                             &linear_pg_table[l1_linear_offset(va)])) )
     {
-        SH_VVLOG("3: not shadowed/mapped gpte=%p spte=%p", gpte, spte);
-        shadow_map_l1_into_current_l2(va);
-        shadow_linear_pg_table[va >> PAGE_SHIFT] = mk_l1_pgentry(spte);
+        printk("shadow_fault(): crashing domain %d "
+               "due to a read-only L2 page table (gpde=%p), va=%p\n",
+               d->id, gpde, va);
+        domain_crash();
     }
 
-    perfc_incrc(shadow_fixup_count);
+    // if necessary, record the page table page as dirty
+    if ( unlikely(shadow_mode_log_dirty(d)) && (orig_gpte != gpte) )
+        mark_dirty(d, __gpfn_to_mfn(d, gpde >> PAGE_SHIFT));
+
+    shadow_set_l1e(va, spte, 1);
+
+    perfc_incrc(shadow_fault_fixed);
     d->arch.shadow_fault_count++;
 
     shadow_unlock(d);
 
-    check_pagetable(d, ed->arch.guest_table, "post-sf");
+    check_pagetable(ed, "post-sf");
     return EXCRET_fault_fixed;
 }
 
-
-void shadow_l1_normal_pt_update(
-    unsigned long pa, unsigned long gpte,
-    unsigned long *prev_smfn_ptr,
-    l1_pgentry_t **prev_spl1e_ptr)
+/*
+ * What lives where in the 32-bit address space in the various shadow modes,
+ * and what it uses to get/maintain that mapping.
+ *
+ * SHADOW MODE:      none         enable         translate         external
+ * 
+ * 4KB things:
+ * guest_vtable    lin_l2     mapped per gpdt  lin_l2 via hl2   mapped per gpdt
+ * shadow_vtable     n/a         sh_lin_l2       sh_lin_l2      mapped per gpdt
+ * hl2_vtable        n/a            n/a        lin_hl2 via hl2  mapped per gpdt
+ * monitor_vtable    n/a            n/a             n/a           mapped once
+ *
+ * 4MB things:
+ * guest_linear  lin via gpdt   lin via gpdt     lin via hl2      lin via hl2
+ * shadow_linear     n/a      sh_lin via spdt  sh_lin via spdt  sh_lin via spdt
+ * monitor_linear    n/a            n/a             n/a              ???
+ * perdomain      perdomain      perdomain       perdomain        perdomain
+ * R/O M2P         R/O M2P        R/O M2P           n/a              n/a
+ * R/W M2P         R/W M2P        R/W M2P         R/W M2P          R/W M2P
+ * P2M               n/a            n/a           R/O M2P          R/O M2P
+ *
+ * NB:
+ * update_pagetables(), __update_pagetables(), shadow_mode_enable(),
+ * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
+ * all play a part in maintaining these mappings.
+ */
+void __update_pagetables(struct exec_domain *ed)
 {
-    unsigned long smfn, spte, prev_smfn = *prev_smfn_ptr;    
-    l1_pgentry_t *spl1e, *prev_spl1e = *prev_spl1e_ptr;
+    struct domain *d = ed->domain;
+    unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
+    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
+    unsigned long smfn, hl2mfn;
 
-    /* N.B. To get here, we know the l1 page *must* be shadowed. */
-    SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%p, "
-             "prev_smfn=%p, prev_spl1e=%p",
-             pa, gpte, prev_smfn, prev_spl1e);
+    int max_mode = ( shadow_mode_external(d) ? SHM_external
+                     : shadow_mode_translate(d) ? SHM_translate
+                     : shadow_mode_enabled(d) ? SHM_enable
+                     : 0 );
 
-    smfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask;
+    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
+    ASSERT( max_mode );
 
-    if ( smfn == prev_smfn )
-    {
-        spl1e = prev_spl1e;
-    }
-    else
+    /*
+     *  arch.guest_vtable
+     */
+    if ( max_mode & (SHM_enable | SHM_external) )
     {
-        if ( prev_spl1e != NULL )
-            unmap_domain_mem( prev_spl1e );
-        spl1e = (l1_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
-        *prev_smfn_ptr  = smfn;
-        *prev_spl1e_ptr = spl1e;
+        if ( likely(ed->arch.guest_vtable != NULL) )
+            unmap_domain_mem(ed->arch.guest_vtable);
+        ed->arch.guest_vtable = map_domain_mem(gmfn << PAGE_SHIFT);
     }
 
-    l1pte_propagate_from_guest(current->domain, &gpte, &spte);
-    spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = mk_l1_pgentry(spte);
-}
-
-void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpde)
-{
-    unsigned long sl2mfn, spde = 0;
-    l2_pgentry_t *spl2e;
-    unsigned long sl1mfn;
-
-    /* N.B. To get here, we know the l2 page *must* be shadowed. */
-    SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%p",pa,gpde);
+    /*
+     *  arch.shadow_table
+     */
+    if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
+        smfn = shadow_l2_table(d, gpfn, gmfn);
+    get_shadow_ref(smfn);
+    if ( pagetable_val(ed->arch.shadow_table) )
+        put_shadow_ref(pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT);
+    ed->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT);
 
-    sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask;
+    SH_VVLOG("0: __update_pagetables(gmfn=%p, smfn=%p)", gmfn, smfn);
 
     /*
-     * Only propagate to shadow if _PAGE_ACCESSED is set in the guest.
-     * Otherwise, to ensure coherency, we blow away the existing shadow value.
+     * arch.shadow_vtable
      */
-    if ( gpde & _PAGE_ACCESSED )
+    if ( max_mode == SHM_external )
     {
-        sl1mfn = (gpde & _PAGE_PRESENT) ?
-            __shadow_status(current->domain, gpde >> PAGE_SHIFT) : 0;
-        l2pde_general(current->domain, &gpde, &spde, sl1mfn);
+        if ( ed->arch.shadow_vtable )
+            unmap_domain_mem(ed->arch.shadow_vtable);
+        ed->arch.shadow_vtable = map_domain_mem(smfn << PAGE_SHIFT);
     }
 
-    spl2e = (l2_pgentry_t *)map_domain_mem(sl2mfn << PAGE_SHIFT);
-    spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)] = mk_l2_pgentry(spde);
-    unmap_domain_mem(spl2e);
-}
+    /*
+     * arch.hl2_vtable
+     */
 
-unsigned long mk_hl2_table(struct exec_domain *ed)
-{
-    struct domain *d = ed->domain;
-    unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
-    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
-    unsigned long hl2mfn, status;
-    struct pfn_info *hl2_info;
-    l1_pgentry_t *hl2;
+    // if max_mode == SHM_translate, then the hl2 is already installed
+    // correctly in its smfn, and there's nothing to do.
+    //
+    if ( max_mode == SHM_external )
+    {
+        if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
+            hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
+        get_shadow_ref(hl2mfn);
 
-    perfc_incr(hl2_table_pages);
+        if ( ed->arch.hl2_vtable )
+            unmap_domain_mem(ed->arch.hl2_vtable);
+        ed->arch.hl2_vtable = map_domain_mem(hl2mfn << PAGE_SHIFT);
+    }
 
-    if ( (hl2_info = alloc_shadow_page(d)) == NULL )
-        BUG(); /* XXX Deal gracefully with failure. */
+    /*
+     * fixup pointers in monitor table, as necessary
+     */
+    if ( max_mode == SHM_external )
+    {
+        l2_pgentry_t *mpl2e = ed->arch.monitor_vtable;
 
-    hl2_info->u.inuse.type_info = PGT_l1_page_table;
+        ASSERT( shadow_mode_translate(d) );
 
-    hl2mfn = page_to_pfn(hl2_info);
-    status = hl2mfn | PSH_hl2;
-    set_shadow_status(ed->domain, gpfn | PSH_hl2, status);
+        mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
+            mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
 
-    // need to optimize this...
-    hl2 = map_domain_mem(hl2mfn << PAGE_SHIFT);
-    memset(hl2, 0, PAGE_SIZE);
-    unmap_domain_mem(hl2);
+        mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+            mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
 
-    return status;
+        // XXX - maybe this can be optimized somewhat??
+        local_flush_tlb();
+    }
 }
 
 
-
 /************************************************************************/
 /************************************************************************/
 /************************************************************************/
@@ -838,12 +2000,13 @@ int shadow_status_noswap;
 
 static int check_pte(
     struct domain *d, unsigned long *pgpte, unsigned long *pspte, 
-    int level, int l2_idx, int l1_idx)
+    int level, int l2_idx, int l1_idx, int oos_ptes)
 {
     unsigned gpte = *pgpte;
     unsigned spte = *pspte;
-    unsigned long mask, gpfn, smfn;
+    unsigned long mask, gpfn, smfn, gmfn;
     int errors = 0;
+    int page_table_page;
 
     if ( (spte == 0) || (spte == 0xdeadface) || (spte == 0x00000E00) )
         return errors;  /* always safe */
@@ -862,21 +2025,36 @@ static int check_pte(
     if ( (spte & mask) != (gpte & mask) )
         FAIL("Corrupt?");
 
-    if ( (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) )
+    if ( (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) && !oos_ptes )
         FAIL("Dirty coherence");
 
-    if ( (spte & _PAGE_ACCESSED ) && !(gpte & _PAGE_ACCESSED) )
+    if ( (spte & _PAGE_ACCESSED ) && !(gpte & _PAGE_ACCESSED) && !oos_ptes )
         FAIL("Accessed coherence");
 
-    if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) )
+    smfn = spte >> PAGE_SHIFT;
+    gpfn = gpte >> PAGE_SHIFT;
+    gmfn = __gpfn_to_mfn(d, gpfn);
+
+    page_table_page = mfn_is_page_table(gmfn);
+
+    if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) && !oos_ptes )
+    {
+        printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d oos_ptes=%d\n",
+               gpfn, gmfn, smfn,
+               frame_table[gmfn].u.inuse.type_info,
+               page_table_page, oos_ptes);
         FAIL("RW coherence");
+    }
 
-    if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY)) )
+    if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY)) && !oos_ptes )
+    {
+        printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d oos_ptes=%d\n",
+               gpfn, gmfn, smfn,
+               frame_table[gmfn].u.inuse.type_info,
+               page_table_page, oos_ptes);
         FAIL("RW2 coherence");
+    }
  
-    smfn = spte >> PAGE_SHIFT;
-    gpfn = gpte >> PAGE_SHIFT;
-
     if ( gpfn == smfn )
     {
         if ( level > 1 )
@@ -887,23 +2065,26 @@ static int check_pte(
         if ( level < 2 )
             FAIL("Shadow in L1 entry?");
 
-        if ( __shadow_status(d, gpfn) != (PSH_shadowed | smfn) )
-            FAIL("smfn problem g.sf=%p", 
-                 __shadow_status(d, gpfn) );
+        if ( level == 2 )
+        {
+            if ( __shadow_status(d, gpfn, PGT_l1_shadow) != smfn )
+                FAIL("smfn problem gpfn=%p smfn=%p", gpfn,
+                     __shadow_status(d, gpfn, PGT_l1_shadow));
+        }
+        else
+            BUG(); // XXX -- not handled yet.
     }
 
     return errors;
 }
 
-
 static int check_l1_table(
-    struct domain *d,
+    struct domain *d, unsigned long gpfn,
     unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
 {
     int i;
     unsigned long *gpl1e, *spl1e;
-    int cpu = current->processor;
-    int errors = 0;
+    int errors = 0, oos_ptes = 0;
 
     // First check to see if this guest page is currently the active
     // PTWR page.  If so, then we compare the (old) cached copy of the
@@ -912,6 +2093,8 @@ static int check_l1_table(
     //
     if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) )
     {
+        int cpu = current->processor;
+
         for ( i = 0; i < ARRAY_SIZE(ptwr_info->ptinfo); i++)
         {
             if ( ptwr_info[cpu].ptinfo[i].l1va &&
@@ -925,11 +2108,18 @@ static int check_l1_table(
         }
     }
 
+    if ( page_out_of_sync(pfn_to_page(gmfn)) )
+    {
+        gmfn = __shadow_status(d, gpfn, PGT_snapshot);
+        oos_ptes = 1;
+        ASSERT(gmfn);
+    }
+
     gpl1e = map_domain_mem(gmfn << PAGE_SHIFT);
     spl1e = map_domain_mem(smfn << PAGE_SHIFT);
 
     for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-        errors += check_pte(d, &gpl1e[i], &spl1e[i], 1, l2_idx, i);
+        errors += check_pte(d, &gpl1e[i], &spl1e[i], 1, l2_idx, i, oos_ptes);
  
     unmap_domain_mem(spl1e);
     unmap_domain_mem(gpl1e);
@@ -944,20 +2134,23 @@ static int check_l1_table(
     } while ( 0 )
 
 int check_l2_table(
-    struct domain *d, unsigned long gpfn, unsigned long smfn)
+    struct domain *d, unsigned long gmfn, unsigned long smfn, int oos_pdes)
 {
-    unsigned long gmfn = __gpfn_to_mfn(d, gpfn);
-    l2_pgentry_t *gpl2e = (l2_pgentry_t *) map_domain_mem( gmfn << PAGE_SHIFT );
-    l2_pgentry_t *spl2e = (l2_pgentry_t *) map_domain_mem( smfn << PAGE_SHIFT );
+    l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_mem(gmfn << PAGE_SHIFT);
+    l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
     int i;
     int errors = 0;
+    int limit;
 
-    if ( page_get_owner(pfn_to_page(gmfn)) != d )
+    if ( !oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != d) )
         FAILPT("domain doesn't own page");
+    if ( oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != NULL) )
+        FAILPT("bogus owner for snapshot page");
     if ( page_get_owner(pfn_to_page(smfn)) != NULL )
         FAILPT("shadow page mfn=0x%08x is owned by someone, domid=%d",
                smfn, page_get_owner(pfn_to_page(smfn))->id);
 
+#if 0
     if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
                 &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
                 ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
@@ -974,40 +2167,62 @@ int check_l2_table(
     if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != 
           l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
         FAILPT("hypervisor linear map inconsistent");
+#endif
 
-    if ( (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> 
+    if ( !shadow_mode_external(d) &&
+         (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> 
                                L2_PAGETABLE_SHIFT]) != 
           ((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR)) )
+    {
         FAILPT("hypervisor shadow linear map inconsistent %p %p",
                l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >>
                                     L2_PAGETABLE_SHIFT]),
                (smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+    }
 
-    if ( !shadow_mode_translate(d) ) {
-        if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
-              ((v2m(page_get_owner(&frame_table[gmfn])->arch.mm_perdomain_pt) |
-                __PAGE_HYPERVISOR))) )
-            FAILPT("hypervisor per-domain map inconsistent");
+    if ( !shadow_mode_external(d) &&
+         (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
+              ((__pa(d->arch.mm_perdomain_pt) | __PAGE_HYPERVISOR))) )
+    {
+        FAILPT("hypervisor per-domain map inconsistent saw %p, expected (va=%p) %p",
+               l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
+               d->arch.mm_perdomain_pt,
+               (__pa(d->arch.mm_perdomain_pt) | __PAGE_HYPERVISOR));
     }
 
+    if ( shadow_mode_external(d) )
+        limit = L2_PAGETABLE_ENTRIES;
+    else
+        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
+
     /* Check the whole L2. */
-    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
-        errors += check_pte(d, &l2_pgentry_val(gpl2e[i]), &l2_pgentry_val(spl2e[i]), 2, i, 0);
+    for ( i = 0; i < limit; i++ )
+        errors += check_pte(d, &l2_pgentry_val(gpl2e[i]), &l2_pgentry_val(spl2e[i]), 2, i, 0, 0);
 
     unmap_domain_mem(spl2e);
     unmap_domain_mem(gpl2e);
 
+#if 1
+    if ( errors )
+        printk("check_l2_table returning %d errors\n", errors);
+#endif
+
     return errors;
 }
 
-int _check_pagetable(struct domain *d, pagetable_t pt, char *s)
+int _check_pagetable(struct exec_domain *ed, char *s)
 {
+    struct domain *d = ed->domain;
+    pagetable_t pt = ed->arch.guest_table;
     unsigned long gptbase = pagetable_val(pt);
-    unsigned long ptbase_pfn, smfn, ss;
+    unsigned long ptbase_pfn, smfn;
     unsigned long i;
     l2_pgentry_t *gpl2e, *spl2e;
     unsigned long ptbase_mfn = 0;
-    int errors = 0;
+    int errors = 0, limit, oos_pdes = 0;
+
+    audit_domain(d);
+    shadow_lock(d);
 
     sh_check_name = s;
     SH_VVLOG("%s-PT Audit", s);
@@ -1017,30 +2232,31 @@ int _check_pagetable(struct domain *d, pagetable_t pt, char *s)
     ptbase_pfn = gptbase >> PAGE_SHIFT;
     ptbase_mfn = __gpfn_to_mfn(d, ptbase_pfn);
 
-    ss = __shadow_status(d, ptbase_pfn);
-  
-    if ( ! (ss & PSH_shadowed) )
+    if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
     {
         printk("%s-PT %p not shadowed\n", s, gptbase);
         errors++;
-
-        if ( ss != 0 )
-            BUG();
-        return errors;
-    }   
+        goto out;
+    }
+    if ( page_out_of_sync(pfn_to_page(ptbase_mfn)) )
+    {
+        ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
+        oos_pdes = 1;
+        ASSERT(ptbase_mfn);
+    }
  
-    smfn = ss & PSH_pfn_mask;
-
-    if ( ss != (PSH_shadowed | smfn) )
-        FAILPT("ptbase shadow inconsistent1");
-
-    errors += check_l2_table(d, ptbase_pfn, smfn);
+    errors += check_l2_table(d, ptbase_mfn, smfn, oos_pdes);
 
     gpl2e = (l2_pgentry_t *) map_domain_mem( ptbase_mfn << PAGE_SHIFT );
     spl2e = (l2_pgentry_t *) map_domain_mem( smfn << PAGE_SHIFT );
 
     /* Go back and recurse. */
-    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+    if ( shadow_mode_external(d) )
+        limit = L2_PAGETABLE_ENTRIES;
+    else
+        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
+
+    for ( i = 0; i < limit; i++ )
     {
         unsigned long gl1pfn = l2_pgentry_val(gpl2e[i]) >> PAGE_SHIFT;
         unsigned long gl1mfn = __gpfn_to_mfn(d, gl1pfn);
@@ -1048,7 +2264,7 @@ int _check_pagetable(struct domain *d, pagetable_t pt, char *s)
 
         if ( l2_pgentry_val(spl2e[i]) != 0 )
         {
-            errors += check_l1_table(d, gl1mfn, sl1mfn, i);
+            errors += check_l1_table(d, gl1pfn, gl1mfn, sl1mfn, i);
         }
     }
 
@@ -1057,22 +2273,23 @@ int _check_pagetable(struct domain *d, pagetable_t pt, char *s)
 
     SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
              sh_l2_present, sh_l1_present);
-#if 1
+
+ out:
     if ( errors )
         BUG();
-#endif
+
+    shadow_unlock(d);
 
     return errors;
 }
 
-int _check_all_pagetables(struct domain *d, char *s)
+int _check_all_pagetables(struct exec_domain *ed, char *s)
 {
-    int i, j;
+    struct domain *d = ed->domain;
+    int i;
     struct shadow_status *a;
     unsigned long gmfn;
     int errors = 0;
-    int cpu;
 
     shadow_status_noswap = 1;
 
@@ -1084,22 +2301,34 @@ int _check_all_pagetables(struct domain *d, char *s)
     for (i = 0; i < shadow_ht_buckets; i++)
     {
         a = &d->arch.shadow_ht[i];
-        while ( a && a->pfn )
+        while ( a && a->gpfn_and_flags )
         {
-            gmfn = __gpfn_to_mfn(d, a->pfn);
-            switch ( frame_table[a->pfn].u.inuse.type_info & PGT_type_mask )
+            gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
+
+            switch ( a->gpfn_and_flags & PGT_type_mask )
             {
-            case PGT_l1_page_table:
-                errors += check_l1_table(d, gmfn, a->smfn_and_flags & PSH_pfn_mask, 0);
+            case PGT_l1_shadow:
+                errors += check_l1_table(d, a->gpfn_and_flags & PGT_mfn_mask,
+                                         gmfn, a->smfn, 0);
+                break;
+            case PGT_l2_shadow:
+                errors += check_l2_table(d, gmfn, a->smfn,
+                                         page_out_of_sync(pfn_to_page(gmfn)));
                 break;
-            case PGT_l2_page_table:
-                errors += check_l2_table(d, gmfn, a->smfn_and_flags & PSH_pfn_mask);
+            case PGT_l3_shadow:
+            case PGT_l4_shadow:
+            case PGT_hl2_shadow:
+                BUG(); // XXX - ought to fix this...
+                break;
+            case PGT_snapshot:
                 break;
             default:
                 errors++;
-                printk("unexpected page type 0x%08x, pfn=0x%08x, gmfn=0x%08x\n",
-                       frame_table[gmfn].u.inuse.type_info,
-                       a->pfn, gmfn);
+                printk("unexpected shadow type %p, gpfn=%p, "
+                       "gmfn=%p smfn=%p\n",
+                       a->gpfn_and_flags & PGT_type_mask,
+                       a->gpfn_and_flags & PGT_mfn_mask,
+                       gmfn, a->smfn);
                 BUG();
             }
             a = a->next;
@@ -1108,52 +2337,8 @@ int _check_all_pagetables(struct domain *d, char *s)
 
     shadow_status_noswap = 0;
 
-    for (i = 0; i < 1024; i++)
-    {
-        if ( l2_pgentry_val(shadow_linear_l2_table[i]) & _PAGE_PRESENT )
-        {
-            unsigned base = i << 10;
-            for (j = 0; j < 1024; j++)
-            {
-                if ( (l1_pgentry_val(shadow_linear_pg_table[base + j]) & PAGE_MASK) == 0x0143d000 )
-                {
-                    printk("sh_ln_pg_tb[0x%08x] => 0x%08lx ",
-                           base + j,
-                           l1_pgentry_val(shadow_linear_pg_table[base + j]));
-                    if ( l1_pgentry_val(shadow_linear_pg_table[base + j]) & _PAGE_PRESENT )
-                        printk(" first entry => 0x%08lx\n",
-                               *(unsigned long *)((base + j) << PAGE_SHIFT));
-                    else
-                        printk(" page not present\n");
-                }
-            }
-        }
-    }
-
     if ( errors )
-    {
-        printk("VM_ASSIST(d, VMASST_TYPE_writable_pagetables) => %d\n",
-               VM_ASSIST(d, VMASST_TYPE_writable_pagetables));
-        for ( cpu = 0; cpu < smp_num_cpus; cpu++ )
-        {
-            for ( j = 0; j < ARRAY_SIZE(ptwr_info->ptinfo); j++)
-            {
-                printk("ptwr_info[%d].ptinfo[%d].l1va => 0x%08x\n",
-                       cpu, j, ptwr_info[cpu].ptinfo[j].l1va);
-                printk("ptwr_info[%d].ptinfo[%d].pl1e => 0x%08x\n",
-                       cpu, j, ptwr_info[cpu].ptinfo[j].pl1e);
-                if (cpu == smp_processor_id())
-                    printk("v2m(ptwr_info[%d].ptinfo[%d].pl1e) => 0x%08x\n",
-                           cpu, j, v2m(ptwr_info[cpu].ptinfo[j].pl1e));
-                printk("ptwr_info[%d].ptinfo[%d].page => 0x%08x\n",
-                       cpu, j, ptwr_info[cpu].ptinfo[j].page);
-                if (cpu == smp_processor_id())
-                    printk("v2m(ptwr_info[%d].ptinfo[%d].page) => 0x%08x\n",
-                           cpu, j, v2m(ptwr_info[cpu].ptinfo[j].page));
-            }
-        }
         BUG();
-    }
 
     return errors;
 }
index 422c737c43baa86e4c259febd9045600048273b9..8516e0b59ba3a4ba3cabf4575b93a26754a0e3c9 100644 (file)
@@ -114,7 +114,7 @@ asmlinkage void fatal_trap(int trapnr, struct xen_regs *regs)
     if ( trapnr == TRAP_page_fault )
     {
         __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (cr2) : );
-        printk("Faulting linear address might be %0lx %lx\n", cr2, cr2);
+        printk("Faulting linear address might be %p\n", cr2);
     }
 
     printk("************************************\n");
@@ -269,6 +269,8 @@ asmlinkage int do_page_fault(struct xen_regs *regs)
 
     DEBUGGER_trap_entry(TRAP_page_fault, regs);
 
+    //printk("do_page_fault(eip=%p, va=%p, code=%d)\n", regs->eip, addr, regs->error_code);
+
     perfc_incrc(page_faults);
 
     if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
@@ -295,9 +297,12 @@ asmlinkage int do_page_fault(struct xen_regs *regs)
         UNLOCK_BIGLOCK(d);
     }
 
-    if ( unlikely(shadow_mode_enabled(d)) && 
-         (addr < PAGE_OFFSET) && shadow_fault(addr, regs) )
+    if ( unlikely(shadow_mode_enabled(d)) &&
+         ((addr < PAGE_OFFSET) || shadow_mode_external(d)) &&
+         shadow_fault(addr, regs) )
+    {
         return EXCRET_fault_fixed;
+    }
 
     if ( unlikely(addr >= LDT_VIRT_START(ed)) && 
          (addr < (LDT_VIRT_START(ed) + (ed->arch.ldt_ents*LDT_ENTRY_SIZE))) )
index ada1403714880660a27e4b8bf0c844c91caa8c59..5ef572ef02a1285a7920d9cf972394bc37681c40 100644 (file)
@@ -106,6 +106,7 @@ static void inline __update_guest_eip(unsigned long inst_len)
 
 static int vmx_do_page_fault(unsigned long va, struct xen_regs *regs) 
 {
+    struct exec_domain *ed = current;
     unsigned long eip;
     unsigned long gpte, gpa;
     int result;
@@ -123,9 +124,9 @@ static int vmx_do_page_fault(unsigned long va, struct xen_regs *regs)
      * If vpagetable is zero, then we are still emulating 1:1 page tables,
      * and we should have never gotten here.
      */
-    if ( !current->arch.guest_vtable )
+    if ( !test_bit(VMX_CPU_STATE_PG_ENABLED, &ed->arch.arch_vmx.cpu_state) )
     {
-        printk("vmx_do_page_fault while still running on 1:1 page table\n");
+        printk("vmx_do_page_fault while running on 1:1 page table\n");
         return 0;
     }
 
@@ -269,21 +270,17 @@ static void vmx_vmexit_do_invlpg(unsigned long va)
 {
     unsigned long eip;
     struct exec_domain *ed = current;
-    unsigned int index;
 
     __vmread(GUEST_EIP, &eip);
 
-    VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg:eip=%p, va=%p",
-            eip, va);
+    VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg: eip=%p, va=%p",
+                eip, va);
 
     /*
      * We do the safest things first, then try to update the shadow
      * copying from guest
      */
     shadow_invlpg(ed, va);
-    index = l2_table_offset(va);
-    ed->arch.hl2_vtable[index] = 
-        mk_l2_pgentry(0); /* invalidate pgd cache */
 }
 
 static void vmx_io_instruction(struct xen_regs *regs, 
@@ -428,14 +425,6 @@ static void mov_to_cr(int gp, int cr, struct xen_regs *regs)
             }
             old_base_mfn = pagetable_val(d->arch.guest_table) >> PAGE_SHIFT;
 
-            /* We know that none of the previous 1:1 shadow pages are
-             * going to be used again, so might as well flush them.
-             * XXXX wait until the last VCPU boots before doing the flush !!
-             */
-            shadow_lock(d->domain);
-            free_shadow_state(d->domain); // XXX SMP
-            shadow_unlock(d->domain);
-
             /*
              * Now arch.guest_table points to machine physical.
              */
@@ -469,7 +458,6 @@ static void mov_to_cr(int gp, int cr, struct xen_regs *regs)
             break;
         }
         
-        hl2_table_invalidate(d);
         /*
          * We make a new one if the shadow does not exist.
          */
@@ -482,8 +470,7 @@ static void mov_to_cr(int gp, int cr, struct xen_regs *regs)
             mfn = phys_to_machine_mapping(value >> PAGE_SHIFT);
             if ((mfn << PAGE_SHIFT) != pagetable_val(d->arch.guest_table))
                 __vmx_bug(regs);
-            vmx_shadow_clear_state(d->domain);
-            shadow_invalidate(d);
+            shadow_sync_all(d->domain);
         } else {
             /*
              * If different, make a shadow. Check if the PDBR is valid
@@ -525,8 +512,6 @@ static void mov_to_cr(int gp, int cr, struct xen_regs *regs)
          */
         if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) {
             vmx_shadow_clear_state(d->domain);
-            shadow_invalidate(d);
-            hl2_table_invalidate(d);
         }
         break;
     default:
index d3ec56cd17cb0baafeea1934f9019e5ee6b02cad..2feeaebb3d9ce05fb5ae80de129e67bfeabb1af5 100644 (file)
@@ -85,6 +85,8 @@ void *map_domain_mem(unsigned long pa)
 void unmap_domain_mem(void *va)
 {
     unsigned int idx;
+    ASSERT((void *)MAPCACHE_VIRT_START <= va);
+    ASSERT(va < (void *)MAPCACHE_VIRT_END);
     idx = ((unsigned long)va - MAPCACHE_VIRT_START) >> PAGE_SHIFT;
     mapcache[idx] |= READY_FOR_TLB_FLUSH;
 }
index 3e7608b59862d15bd0878719f1a9fe9900ad75d0..b4553f5d1ce8c92844477933e2bb8a4a7fbd7c7e 100644 (file)
@@ -14,6 +14,7 @@
 #include <xen/sched.h>
 #include <xen/event.h>
 #include <asm/domain_page.h>
+#include <asm/shadow.h>
 
 /*
  * To allow safe resume of do_dom_mem_op() after preemption, we need to know 
@@ -111,6 +112,27 @@ free_dom_mem(struct domain *d,
             if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
                 put_page(page);
 
+            if ( unlikely(shadow_mode_enabled(d)) )
+            {
+                // XXX This needs more thought.  This isn't pretty,
+                // and it's not fast.  But it's a place holder.
+                //
+                shadow_lock(d);
+                if ( page_out_of_sync(page) )
+                    __shadow_sync_mfn(d, mpfn + j);
+                shadow_remove_all_access(d, mpfn + j);
+
+                if (page->count_info != 1)
+                {
+                    printk("free_dom_mem in shadow mode didn't release page "
+                           "mfn=%p c=%p\n", mpfn+j, page->count_info);
+                    shadow_unlock(d);
+                    audit_domain(d);
+                    BUG();
+                }
+                shadow_unlock(d);
+            }
+
             put_page(page);
         }
     }
index 61b4b2b4a805ca9704136bcb7dcda2d7f2f487eb..60c0adc35af2d2c18797092a74a19821b1760273 100644 (file)
@@ -29,6 +29,7 @@
 #include <xen/slab.h>
 #include <xen/irq.h>
 #include <asm/domain_page.h>
+#include <asm/shadow.h>
 
 /*
  * Comma-separated list of hexadecimal page numbers containing bad bytes.
@@ -566,7 +567,23 @@ void free_domheap_pages(struct pfn_info *pg, unsigned int order)
 
         for ( i = 0; i < (1 << order); i++ )
         {
-            ASSERT((pg[i].u.inuse.type_info & PGT_count_mask) == 0);
+            if ( ((pg[i].u.inuse.type_info & PGT_count_mask) != 0) &&
+                shadow_mode_enabled(d) )
+            {
+                // XXX This needs more thought...
+                //
+                printk("%s: needing to call shadow_remove_all_access for mfn=%p\n",
+                       __func__, page_to_pfn(&pg[i]));
+                printk("Amfn=%p c=%p t=%p\n", page_to_pfn(&pg[i]),
+                       pg[i].count_info, pg[i].u.inuse.type_info);
+                shadow_lock(d);
+                shadow_remove_all_access(d, page_to_pfn(&pg[i]));
+                shadow_unlock(d);
+                printk("Bmfn=%p c=%p t=%p\n", page_to_pfn(&pg[i]),
+                       pg[i].count_info, pg[i].u.inuse.type_info);
+            }
+
+            ASSERT( (pg[i].u.inuse.type_info & PGT_count_mask) == 0 );
             pg[i].tlbflush_timestamp  = tlbflush_current_time();
             pg[i].u.free.cpu_mask     = cpu_mask;
             list_del(&pg[i].list);
index f5d4727ff7c168f23ad34341d94224c1effa36ef..f28a352eeddba7637cff8d80d456c96d6e497e11 100644 (file)
@@ -423,6 +423,9 @@ void __enter_scheduler(void)
     
     perfc_incrc(sched_ctx);
 
+    // Q: With full shadow mode, do we need to flush out-of-sync pages
+    //    before switching domains?  Current belief is NO.
+
     if ( !is_idle_task(prev->domain) )
     {
         LOCK_BIGLOCK(prev->domain);
index d8bb48043df7850663dcc1434800c7f7fb898ae1..02276cbb55fc34be4f6202b80ba1262619afa0bc 100644 (file)
@@ -35,11 +35,21 @@ struct arch_domain
     unsigned int shadow_dirty_bitmap_size;  /* in pages, bit per page */
 
     /* shadow mode stats */
-    unsigned int shadow_page_count;     
-    unsigned int shadow_fault_count;     
-    unsigned int shadow_dirty_count;     
-    unsigned int shadow_dirty_net_count;     
-    unsigned int shadow_dirty_block_count;     
+    unsigned int shadow_page_count;
+    unsigned int hl2_page_count;
+    unsigned int snapshot_page_count;
+
+    unsigned int shadow_fault_count;
+    unsigned int shadow_dirty_count;
+    unsigned int shadow_dirty_net_count;
+    unsigned int shadow_dirty_block_count;
+
+    /* full shadow mode */
+    struct out_of_sync_entry *out_of_sync; /* list of out-of-sync pages */
+    struct out_of_sync_entry *out_of_sync_free;
+    struct out_of_sync_entry *out_of_sync_extras;
+    unsigned int out_of_sync_extras_count;
+
 } __cacheline_aligned;
 
 struct arch_exec_domain
@@ -109,8 +119,8 @@ struct arch_exec_domain
 
     l2_pgentry_t *guest_vtable;         /* virtual address of pagetable */
     l2_pgentry_t *shadow_vtable;        /* virtual address of shadow_table */
-    l2_pgentry_t *hl2_vtable;                  /* virtual address of hl2_table */
     l2_pgentry_t *monitor_vtable;              /* virtual address of monitor_table */
+    l1_pgentry_t *hl2_vtable;                  /* virtual address of hl2_table */
 
     /* Virtual CR2 value. Can be read/written by guest. */
     unsigned long guest_cr2;
index 3e4b1d4b0b51f16581f75f61e876f11a2f9db035..7cb895e9fc13cb832010d4a5a1343c272fb0e3ad 100644 (file)
@@ -69,7 +69,16 @@ struct pfn_info
 #define PGT_gdt_page        (5<<29) /* using this page in a GDT? */
 #define PGT_ldt_page        (6<<29) /* using this page in an LDT? */
 #define PGT_writable_page   (7<<29) /* has writable mappings of this page? */
+
+#define PGT_l1_shadow       PGT_l1_page_table
+#define PGT_l2_shadow       PGT_l2_page_table
+#define PGT_l3_shadow       PGT_l3_page_table
+#define PGT_l4_shadow       PGT_l4_page_table
+#define PGT_hl2_shadow      (5<<29)
+#define PGT_snapshot        (6<<29)
+
 #define PGT_type_mask       (7<<29) /* Bits 29-31. */
+
  /* Has this page been validated for use as its current type? */
 #define _PGT_validated      28
 #define PGT_validated       (1U<<_PGT_validated)
@@ -86,11 +95,19 @@ struct pfn_info
  /* 17-bit count of uses of this frame as its current type. */
 #define PGT_count_mask      ((1U<<17)-1)
 
+#define PGT_mfn_mask        ((1U<<21)-1) /* mfn mask for shadow types */
+
  /* Cleared when the owning guest 'frees' this page. */
 #define _PGC_allocated      31
 #define PGC_allocated       (1U<<_PGC_allocated)
- /* 31-bit count of references to this frame. */
-#define PGC_count_mask      ((1U<<31)-1)
+ /* Set when fullshadow mode marks a page out-of-sync */
+#define _PGC_out_of_sync     30
+#define PGC_out_of_sync     (1U<<_PGC_out_of_sync)
+ /* Set when fullshadow mode is using a page as a page table */
+#define _PGC_page_table      29
+#define PGC_page_table      (1U<<_PGC_page_table)
+ /* 29-bit count of references to this frame. */
+#define PGC_count_mask      ((1U<<29)-1)
 
 /* We trust the slab allocator in slab.c, and our use of it. */
 #define PageSlab(page)     (1)
@@ -112,6 +129,8 @@ static inline u32 pickle_domptr(struct domain *domain)
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
 
+#define page_out_of_sync(_p)  ((_p)->count_info & PGC_out_of_sync)
+
 #define SHARE_PFN_WITH_DOMAIN(_pfn, _dom)                                   \
     do {                                                                    \
         page_set_owner((_pfn), (_dom));                                     \
@@ -135,6 +154,11 @@ void init_frametable(void);
 
 int alloc_page_type(struct pfn_info *page, unsigned int type);
 void free_page_type(struct pfn_info *page, unsigned int type);
+extern void invalidate_shadow_ldt(struct exec_domain *d);
+extern u32 shadow_remove_all_write_access(
+    struct domain *d, unsigned min_type, unsigned max_type,
+    unsigned long gpfn);
+extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn);
 
 static inline void put_page(struct pfn_info *page)
 {
@@ -166,8 +190,10 @@ static inline int get_page(struct pfn_info *page,
              unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
              unlikely(d != _domain) )                /* Wrong owner? */
         {
-            DPRINTK("Error pfn %p: ed=%p, sd=%p, caf=%08x, taf=%08x\n",
-                    page_to_pfn(page), domain, unpickle_domptr(d),
+            DPRINTK("Error pfn %p: rd=%p(%d), od=%p(%d), caf=%08x, taf=%08x\n",
+                    page_to_pfn(page), domain, (domain ? domain->id : -1),
+                    page_get_owner(page),
+                    (page_get_owner(page) ? page_get_owner(page)->id : -1),
                     x, page->u.inuse.type_info);
             return 0;
         }
@@ -184,6 +210,8 @@ static inline int get_page(struct pfn_info *page,
 
 void put_page_type(struct pfn_info *page);
 int  get_page_type(struct pfn_info *page, u32 type);
+int  get_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
+void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
 
 static inline void put_page_and_type(struct pfn_info *page)
 {
@@ -207,6 +235,22 @@ static inline int get_page_and_type(struct pfn_info *page,
     return rc;
 }
 
+static inline int mfn_is_page_table(unsigned long mfn)
+{
+    if ( !pfn_is_ram(mfn) )
+        return 0;
+
+    return frame_table[mfn].count_info & PGC_page_table;
+}
+
+static inline int page_is_page_table(struct pfn_info *page)
+{
+    if ( !pfn_is_ram(page_to_pfn(page)) )
+        return 0;
+
+    return page->count_info & PGC_page_table;
+}
+
 #define ASSERT_PAGE_IS_TYPE(_p, _t)                            \
     ASSERT(((_p)->u.inuse.type_info & PGT_type_mask) == (_t)); \
     ASSERT(((_p)->u.inuse.type_info & PGT_count_mask) != 0)
@@ -307,6 +351,7 @@ void ptwr_flush(const int);
 int ptwr_do_page_fault(unsigned long);
 
 int new_guest_cr3(unsigned long pfn);
+void propagate_page_fault(unsigned long addr, u16 error_code);
 
 #define __cleanup_writable_pagetable(_what)                                 \
 do {                                                                        \
@@ -326,14 +371,24 @@ do {                                                                        \
                                      PTWR_CLEANUP_INACTIVE);              \
     } while ( 0 )
 
+int audit_adjust_pgtables(struct domain *d, int dir, int noisy);
+
 #ifndef NDEBUG
-void audit_domain(struct domain *d);
+
+#define AUDIT_ALREADY_LOCKED ( 1u << 0 )
+#define AUDIT_ERRORS_OK      ( 1u << 1 )
+#define AUDIT_QUIET          ( 1u << 2 )
+
+void _audit_domain(struct domain *d, int flags, const char *file, int line);
+#define audit_domain(_d) _audit_domain((_d), 0, __FILE__, __LINE__)
 void audit_domains(void);
+
 #else
+
+#define _audit_domain(_d, _f, _file, _line) ((void)0)
 #define audit_domain(_d) ((void)0)
 #define audit_domains()  ((void)0)
-#endif
 
-void propagate_page_fault(unsigned long addr, u16 error_code);
+#endif
 
 #endif /* __ASM_X86_MM_H__ */
index 1dad979f2e6279c246e849cc20b6d81ba139f63b..660ba4cfd8a8b3ff34a41127e8e98e4ec2f44b79 100644 (file)
@@ -57,9 +57,11 @@ typedef struct { unsigned long pt_lo; } pagetable_t;
 #include <asm/flushtlb.h>
 
 #define linear_pg_table ((l1_pgentry_t *)LINEAR_PT_VIRT_START)
-#define linear_l2_table ((l2_pgentry_t *)(LINEAR_PT_VIRT_START+(LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT))))
+#define __linear_l2_table ((l2_pgentry_t *)(LINEAR_PT_VIRT_START + \
+     (LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT))))
+#define linear_l2_table(_ed) ((_ed)->arch.guest_vtable)
 
-#define va_to_l1mfn(_va) (l2_pgentry_val(linear_l2_table[_va>>L2_PAGETABLE_SHIFT]) >> PAGE_SHIFT)
+#define va_to_l1mfn(_ed, _va) (l2_pgentry_val(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT]) >> PAGE_SHIFT)
 
 extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
 
index 2e364eacc891464bcd12b695e86bd08f771261ec..89c09d004f62adebf4e08cef7d2154dc9b3f4e56 100644 (file)
@@ -1,3 +1,22 @@
+/******************************************************************************
+ * include/asm-x86/shadow.h
+ * 
+ * Copyright (c) 2005 Michael A Fetterman
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
 
 #ifndef _XEN_SHADOW_H
 #define _XEN_SHADOW_H
 #include <asm/processor.h>
 #include <asm/domain_page.h>
 
-/* Shadow PT flag bits in shadow_status */
-#define PSH_shadowed    (1<<31) /* page has a shadow. PFN points to shadow */
-#define PSH_hl2         (1<<30) /* page is an hl2 */
-#define PSH_pfn_mask    ((1<<21)-1)
+/* Shadow PT operation mode : shadow-mode variable in arch_domain. */
 
-/* Shadow PT operation mode: shadow-mode variable in arch_domain. */
 #define SHM_enable    (1<<0) /* we're in one of the shadow modes */
 #define SHM_log_dirty (1<<1) /* enable log dirty mode */
-#define SHM_translate (1<<2) /* do p2m translation on guest tables */
+#define SHM_translate (1<<2) /* do p2m translation on guest tables */
 #define SHM_external  (1<<3) /* external page table, not used by Xen */
 
 #define shadow_mode_enabled(_d)   ((_d)->arch.shadow_mode)
 #define shadow_mode_log_dirty(_d) ((_d)->arch.shadow_mode & SHM_log_dirty)
 #define shadow_mode_translate(_d) ((_d)->arch.shadow_mode & SHM_translate)
-#ifndef __x86_64__ /* XXX Currently breaks the 64-bit build. */
 #define shadow_mode_external(_d)  ((_d)->arch.shadow_mode & SHM_external)
-#else
-#define shadow_mode_external(_d)  (0)
-#endif
 
 #define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
-#define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \
+#define __shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \
      (SH_LINEAR_PT_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
+#define shadow_linear_l2_table(_ed) ((_ed)->arch.shadow_vtable)
+
+// easy access to the hl2 table (for translated but not external modes only)
+#define __linear_hl2_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START + \
+     (PERDOMAIN_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
 
 #define shadow_lock_init(_d) spin_lock_init(&(_d)->arch.shadow_lock)
 #define shadow_lock(_d)      spin_lock(&(_d)->arch.shadow_lock)
 extern void shadow_mode_init(void);
 extern int shadow_mode_control(struct domain *p, dom0_shadow_control_t *sc);
 extern int shadow_fault(unsigned long va, struct xen_regs *regs);
-extern void shadow_l1_normal_pt_update(
-    unsigned long pa, unsigned long gpte, 
-    unsigned long *prev_spfn_ptr, l1_pgentry_t **prev_spl1e_ptr);
-extern void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpde);
-extern void unshadow_table(unsigned long gpfn, unsigned int type);
 extern int shadow_mode_enable(struct domain *p, unsigned int mode);
-extern void free_shadow_state(struct domain *d);
 extern void shadow_invlpg(struct exec_domain *, unsigned long);
-extern unsigned long mk_hl2_table(struct exec_domain *ed);
+extern struct out_of_sync_entry *shadow_mark_mfn_out_of_sync(
+    struct exec_domain *ed, unsigned long gpfn, unsigned long mfn);
+extern void free_monitor_pagetable(struct exec_domain *ed);
+extern void __shadow_sync_all(struct domain *d);
+extern int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va);
+
+static inline unsigned long __shadow_status(
+    struct domain *d, unsigned long gpfn, unsigned long stype);
 
 extern void vmx_shadow_clear_state(struct domain *);
 
+/************************************************************************/
+
+static void inline
+__shadow_sync_mfn(struct domain *d, unsigned long mfn)
+{
+    if ( d->arch.out_of_sync )
+    {
+        // XXX - could be smarter
+        //
+        __shadow_sync_all(d);
+    }
+}
+
+static void inline
+__shadow_sync_va(struct exec_domain *ed, unsigned long va)
+{
+    struct domain *d = ed->domain;
+
+    if ( d->arch.out_of_sync && __shadow_out_of_sync(ed, va) )
+    {
+        // XXX - could be smarter
+        //
+        __shadow_sync_all(ed->domain);
+    }
+}
+
+static void inline
+shadow_sync_all(struct domain *d)
+{
+    if ( unlikely(shadow_mode_enabled(d)) )
+    {
+        shadow_lock(d);
+
+        if ( d->arch.out_of_sync )
+            __shadow_sync_all(d);
+
+        ASSERT(d->arch.out_of_sync == NULL);
+
+        shadow_unlock(d);
+    }
+}
+
+// SMP BUG: This routine can't ever be used properly in an SMP context.
+//          It should be something like get_shadow_and_sync_va().
+//          This probably shouldn't exist.
+//
+static void inline
+shadow_sync_va(struct exec_domain *ed, unsigned long gva)
+{
+    struct domain *d = ed->domain;
+    if ( unlikely(shadow_mode_enabled(d)) )
+    {
+        shadow_lock(d);
+        __shadow_sync_va(ed, gva);
+        shadow_unlock(d);
+    }
+}
+
+extern void __shadow_mode_disable(struct domain *d);
+static inline void shadow_mode_disable(struct domain *d)
+{
+    if ( shadow_mode_enabled(d) )
+        __shadow_mode_disable(d);
+}
+
+/************************************************************************/
+
 #define __mfn_to_gpfn(_d, mfn)                         \
     ( (shadow_mode_translate(_d))                      \
       ? machine_to_phys_mapping[(mfn)]                 \
@@ -61,39 +145,41 @@ extern void vmx_shadow_clear_state(struct domain *);
       ? phys_to_machine_mapping(gpfn)                  \
       : (gpfn) )
 
-extern void __shadow_mode_disable(struct domain *d);
-static inline void shadow_mode_disable(struct domain *d)
-{
-    if ( shadow_mode_enabled(d) )
-        __shadow_mode_disable(d);
-}
+/************************************************************************/
 
-extern unsigned long shadow_l2_table( 
-    struct domain *d, unsigned long gmfn);
-  
-static inline void shadow_invalidate(struct exec_domain *ed) {
-    if ( !VMX_DOMAIN(ed) )
-        BUG();
-    memset(ed->arch.shadow_vtable, 0, PAGE_SIZE);
-}
+struct shadow_status {
+    unsigned long gpfn_and_flags; /* Guest pfn plus flags. */
+    struct shadow_status *next;   /* Pull-to-front list.   */
+    unsigned long smfn;           /* Shadow mfn.           */
+};
+
+#define shadow_ht_extra_size 128
+#define shadow_ht_buckets    256
+
+struct out_of_sync_entry {
+    struct out_of_sync_entry *next;
+    unsigned long gpfn;    /* why is this here? */
+    unsigned long gmfn;
+    unsigned long snapshot_mfn;
+    unsigned long writable_pl1e; /* NB: this is a machine address */
+};
+
+#define out_of_sync_extra_size 127
+
+#define SHADOW_SNAPSHOT_ELSEWHERE (-1L)
+
+/************************************************************************/
 
 #define SHADOW_DEBUG 0
 #define SHADOW_VERBOSE_DEBUG 0
+#define SHADOW_VVERBOSE_DEBUG 0
 #define SHADOW_HASH_DEBUG 0
+#define FULLSHADOW_DEBUG 0
 
 #if SHADOW_DEBUG
 extern int shadow_status_noswap;
 #endif
 
-struct shadow_status {
-    unsigned long pfn;            /* Guest pfn.             */
-    unsigned long smfn_and_flags; /* Shadow mfn plus flags. */
-    struct shadow_status *next;   /* Pull-to-front list.    */
-};
-
-#define shadow_ht_extra_size 128
-#define shadow_ht_buckets    256
-
 #ifdef VERBOSE
 #define SH_LOG(_f, _a...)                                               \
     printk("DOM%uP%u: SH_LOG(%d): " _f "\n",                            \
@@ -102,7 +188,7 @@ struct shadow_status {
 #define SH_LOG(_f, _a...) 
 #endif
 
-#if SHADOW_DEBUG
+#if SHADOW_VERBOSE_DEBUG
 #define SH_VLOG(_f, _a...)                                              \
     printk("DOM%uP%u: SH_VLOG(%d): " _f "\n",                           \
            current->domain->id, current->processor, __LINE__ , ## _a )
@@ -110,7 +196,7 @@ struct shadow_status {
 #define SH_VLOG(_f, _a...) 
 #endif
 
-#if SHADOW_VERBOSE_DEBUG
+#if SHADOW_VVERBOSE_DEBUG
 #define SH_VVLOG(_f, _a...)                                             \
     printk("DOM%uP%u: SH_VVLOG(%d): " _f "\n",                          \
            current->domain->id, current->processor, __LINE__ , ## _a )
@@ -118,60 +204,148 @@ struct shadow_status {
 #define SH_VVLOG(_f, _a...)
 #endif
 
-// BUG: mafetter: this assumes ed == current, so why pass ed?
-static inline void __shadow_get_l2e(
-    struct exec_domain *ed, unsigned long va, unsigned long *sl2e)
+#if FULLSHADOW_DEBUG
+#define FSH_LOG(_f, _a...)                                              \
+    printk("DOM%uP%u: FSH_LOG(%d): " _f "\n",                           \
+           current->domain->id, current->processor, __LINE__ , ## _a )
+#else
+#define FSH_LOG(_f, _a...) 
+#endif
+
+
+/************************************************************************/
+
+static inline void
+__shadow_get_l2e(
+    struct exec_domain *ed, unsigned long va, unsigned long *psl2e)
 {
-    if ( !likely(shadow_mode_enabled(ed->domain)) )
-        BUG();
+    ASSERT(shadow_mode_enabled(ed->domain));
 
-    if ( shadow_mode_translate(ed->domain) )
-        *sl2e = l2_pgentry_val(
-            ed->arch.shadow_vtable[l2_table_offset(va)]);       
-    else 
-        *sl2e = l2_pgentry_val(
-            shadow_linear_l2_table[l2_table_offset(va)]);
+    *psl2e = l2_pgentry_val( ed->arch.shadow_vtable[l2_table_offset(va)]);
 }
 
-static inline void __shadow_set_l2e(
+static inline void
+__shadow_set_l2e(
     struct exec_domain *ed, unsigned long va, unsigned long value)
 {
-    if ( !likely(shadow_mode_enabled(ed->domain)) )
-        BUG();
+    ASSERT(shadow_mode_enabled(ed->domain));
 
-    if ( shadow_mode_translate(ed->domain) ) 
-        ed->arch.shadow_vtable[l2_table_offset(va)] = mk_l2_pgentry(value);
-    else 
-        shadow_linear_l2_table[l2_table_offset(va)] = mk_l2_pgentry(value);
+    ed->arch.shadow_vtable[l2_table_offset(va)] = mk_l2_pgentry(value);
 }
 
-static inline void __guest_get_l2e(
-    struct exec_domain *ed, unsigned long va, unsigned long *l2e)
+static inline void
+__guest_get_l2e(
+    struct exec_domain *ed, unsigned long va, unsigned long *pl2e)
 {
-    *l2e = ( shadow_mode_translate(ed->domain) ) ?
-        l2_pgentry_val(ed->arch.guest_vtable[l2_table_offset(va)]) :
-        l2_pgentry_val(linear_l2_table[l2_table_offset(va)]);
+    *pl2e = l2_pgentry_val(ed->arch.guest_vtable[l2_table_offset(va)]);
 }
 
-static inline void __guest_set_l2e(
+static inline void
+__guest_set_l2e(
     struct exec_domain *ed, unsigned long va, unsigned long value)
 {
-    if ( shadow_mode_translate(ed->domain) )
+    if ( unlikely(shadow_mode_translate(ed->domain)) )
     {
-        unsigned long pfn;
+        unsigned long mfn = phys_to_machine_mapping(value >> PAGE_SHIFT);
+        unsigned long old_hl2e =
+            l1_pgentry_val(ed->arch.hl2_vtable[l2_table_offset(va)]);
+        unsigned long new_hl2e =
+            (mfn ? ((mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR) : 0);
+
+        // only do the ref counting if something important changed.
+        //
+        if ( (old_hl2e ^ new_hl2e) & (PAGE_MASK | _PAGE_PRESENT) )
+        {
+            if ( new_hl2e & _PAGE_PRESENT )
+                get_page_from_l1e(mk_l1_pgentry(new_hl2e), ed->domain);
+            if ( old_hl2e & _PAGE_PRESENT )
+                put_page_from_l1e(mk_l1_pgentry(old_hl2e), ed->domain);
+        }
+
+        ed->arch.hl2_vtable[l2_table_offset(va)] = mk_l1_pgentry(new_hl2e);
+    }
+
+    ed->arch.guest_vtable[l2_table_offset(va)] = mk_l2_pgentry(value);
+}
+
+/************************************************************************/
+
+/*
+ * Add another shadow reference to smfn.
+ */
+static inline int
+get_shadow_ref(unsigned long smfn)
+{
+    u32 x, nx;
+
+    ASSERT(pfn_is_ram(smfn));
 
-        pfn = phys_to_machine_mapping(value >> PAGE_SHIFT);
-        ed->arch.hl2_vtable[l2_table_offset(va)] =
-            mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+    x = frame_table[smfn].count_info;
+    nx = x + 1;
 
-        ed->arch.guest_vtable[l2_table_offset(va)] = mk_l2_pgentry(value);
+    if ( unlikely(nx == 0) )
+    {
+        printk("get_shadow_ref overflow, gmfn=%p smfn=%p\n",
+               frame_table[smfn].u.inuse.type_info & PGT_mfn_mask, smfn);
+        BUG();
     }
-    else
+    
+    // Guarded by the shadow lock...
+    //
+    frame_table[smfn].count_info = nx;
+
+    return 1;
+}
+
+extern void free_shadow_page(unsigned long smfn);
+
+/*
+ * Drop a shadow reference to smfn.
+ */
+static inline void
+put_shadow_ref(unsigned long smfn)
+{
+    u32 x, nx;
+
+    ASSERT(pfn_is_ram(smfn));
+
+    x = frame_table[smfn].count_info;
+    nx = x - 1;
+
+    if ( unlikely(x == 0) )
     {
-        linear_l2_table[l2_table_offset(va)] = mk_l2_pgentry(value);
+        printk("put_shadow_ref underflow, gmfn=%p smfn=%p\n",
+               frame_table[smfn].u.inuse.type_info & PGT_mfn_mask, smfn);
+        BUG();
     }
+
+    // Guarded by the shadow lock...
+    //
+    frame_table[smfn].count_info = nx;
+
+    if ( unlikely(nx == 0) )
+    {
+        free_shadow_page(smfn);
+    }
+}
+
+static inline void
+shadow_pin(unsigned long smfn)
+{
+    ASSERT( !(frame_table[smfn].u.inuse.type_info & PGT_pinned) );
+
+    frame_table[smfn].u.inuse.type_info |= PGT_pinned;
+    get_shadow_ref(smfn);
 }
 
+static inline void
+shadow_unpin(unsigned long smfn)
+{
+    frame_table[smfn].u.inuse.type_info &= ~PGT_pinned;
+    put_shadow_ref(smfn);
+}
+
+
 /************************************************************************/
 
 static inline int __mark_dirty(struct domain *d, unsigned int mfn)
@@ -182,7 +356,7 @@ static inline int __mark_dirty(struct domain *d, unsigned int mfn)
     ASSERT(spin_is_locked(&d->arch.shadow_lock));
     ASSERT(d->arch.shadow_dirty_bitmap != NULL);
 
-    pfn = machine_to_phys_mapping[mfn];
+    pfn = __mfn_to_gpfn(d, mfn);
 
     /*
      * Values with the MSB set denote MFNs that aren't really part of the 
@@ -229,23 +403,41 @@ static inline int mark_dirty(struct domain *d, unsigned int mfn)
 
 /************************************************************************/
 
+extern void shadow_mark_out_of_sync(
+    struct exec_domain *ed, unsigned long gpfn, unsigned long mfn,
+    unsigned long va);
+
 static inline void l1pte_write_fault(
-    struct domain *d, unsigned long *gpte_p, unsigned long *spte_p)
-{ 
+    struct exec_domain *ed, unsigned long *gpte_p, unsigned long *spte_p,
+    unsigned long va)
+{
+    struct domain *d = ed->domain;
     unsigned long gpte = *gpte_p;
-    unsigned long spte = *spte_p;
-    unsigned long pfn = gpte >> PAGE_SHIFT;
-    unsigned long mfn = __gpfn_to_mfn(d, pfn);
+    unsigned long spte;
+    unsigned long gpfn = gpte >> PAGE_SHIFT;
+    unsigned long mfn = __gpfn_to_mfn(d, gpfn);
+
+    //printk("l1pte_write_fault gmfn=%p\n", mfn);
+
+    if ( unlikely(!mfn) )
+    {
+        SH_LOG("l1pte_write_fault: invalid gpfn=%p", gpfn);
+        *spte_p = 0;
+        return;
+    }
 
     ASSERT(gpte & _PAGE_RW);
     gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
+    spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
+
+    SH_VVLOG("l1pte_write_fault: updating spte=0x%p gpte=0x%p", spte, gpte);
 
     if ( shadow_mode_log_dirty(d) )
-        __mark_dirty(d, pfn);
+        __mark_dirty(d, mfn);
 
-    spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
+    if ( mfn_is_page_table(mfn) )
+        shadow_mark_out_of_sync(ed, gpfn, mfn, va);
 
-    SH_VVLOG("l1pte_write_fault: updating spte=0x%p gpte=0x%p", spte, gpte);
     *gpte_p = gpte;
     *spte_p = spte;
 }
@@ -258,11 +450,21 @@ static inline void l1pte_read_fault(
     unsigned long pfn = gpte >> PAGE_SHIFT;
     unsigned long mfn = __gpfn_to_mfn(d, pfn);
 
+    if ( unlikely(!mfn) )
+    {
+        SH_LOG("l1pte_read_fault: invalid gpfn=%p", pfn);
+        *spte_p = 0;
+        return;
+    }
+
     gpte |= _PAGE_ACCESSED;
     spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
 
-    if ( shadow_mode_log_dirty(d) || !(gpte & _PAGE_DIRTY) )
+    if ( shadow_mode_log_dirty(d) || !(gpte & _PAGE_DIRTY) ||
+         mfn_is_page_table(mfn) )
+    {
         spte &= ~_PAGE_RW;
+    }
 
     SH_VVLOG("l1pte_read_fault: updating spte=0x%p gpte=0x%p", spte, gpte);
     *gpte_p = gpte;
@@ -270,9 +472,8 @@ static inline void l1pte_read_fault(
 }
 
 static inline void l1pte_propagate_from_guest(
-    struct domain *d, unsigned long *gpte_p, unsigned long *spte_p)
+    struct domain *d, unsigned long gpte, unsigned long *spte_p)
 { 
-    unsigned long gpte = *gpte_p;
     unsigned long spte = *spte_p;
     unsigned long pfn = gpte >> PAGE_SHIFT;
     unsigned long mfn = __gpfn_to_mfn(d, pfn);
@@ -281,33 +482,36 @@ static inline void l1pte_propagate_from_guest(
     unsigned long old_spte = spte;
 #endif
 
-    /* Use 1:1 page table to identify MMIO address space */
-    if ( shadow_mode_external(d) && mmio_space(gpte) ) {
+    if ( unlikely(!mfn) )
+    {
+        // likely an MMIO address space mapping...
+        //
         *spte_p = 0;
         return;
     }
-    
+
     spte = 0;
     if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
          (_PAGE_PRESENT|_PAGE_ACCESSED) ) {
         
         spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
         
-        if ( shadow_mode_log_dirty(d) || !(gpte & _PAGE_DIRTY) )
+        if ( shadow_mode_log_dirty(d) ||
+             !(gpte & _PAGE_DIRTY) ||
+             mfn_is_page_table(mfn) )
+        {
             spte &= ~_PAGE_RW;
+        }
     }
-        
+
 #if SHADOW_VERBOSE_DEBUG
     if ( old_spte || spte || gpte )
-        SH_VVLOG("l1pte_propagate_from_guest: gpte=0x%p, old spte=0x%p, new spte=0x%p ", gpte, old_spte, spte);
+        debugtrace_printk("l1pte_propagate_from_guest: gpte=0x%p, old spte=0x%p, new spte=0x%p\n", gpte, old_spte, spte);
 #endif
 
-    *gpte_p = gpte;
     *spte_p = spte;
 }
 
-
-
 static inline void l2pde_general(
     struct domain *d,
     unsigned long *gpde_p,
@@ -315,33 +519,104 @@ static inline void l2pde_general(
     unsigned long sl1mfn)
 {
     unsigned long gpde = *gpde_p;
-    unsigned long spde = *spde_p;
+    unsigned long spde;
 
     spde = 0;
-
-    if ( sl1mfn != 0 )
+    if ( (gpde & _PAGE_PRESENT) && (sl1mfn != 0) )
     {
         spde = (gpde & ~PAGE_MASK) | (sl1mfn << PAGE_SHIFT) | 
             _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY;
         gpde |= _PAGE_ACCESSED; /* N.B. PDEs do not have a dirty bit. */
 
-        /* Detect linear p.t. mappings and write-protect them. */
-        if ( (frame_table[sl1mfn].u.inuse.type_info & PGT_type_mask) ==
-             PGT_l2_page_table ) 
-        {
-            if ( !shadow_mode_translate(d) )
-                spde = gpde & ~_PAGE_RW;
-
-        }
+        // XXX mafetter: Hmm...
+        //     Shouldn't the dirty log be checked/updated here?
+        //     Actually, it needs to be done in this function's callers.
+        //
+        *gpde_p = gpde;
     }
 
-    *gpde_p = gpde;
     *spde_p = spde;
 }
 
+static inline void l2pde_propagate_from_guest(
+    struct domain *d, unsigned long *gpde_p, unsigned long *spde_p)
+{
+    unsigned long gpde = *gpde_p, sl1mfn;
+
+    sl1mfn =  __shadow_status(d, gpde >> PAGE_SHIFT, PGT_l1_shadow);
+    l2pde_general(d, gpde_p, spde_p, sl1mfn);
+}
+    
+/************************************************************************/
+
+// returns true if a tlb flush is needed
+//
+static int inline
+validate_pte_change(
+    struct domain *d,
+    unsigned long new_pte,
+    unsigned long *shadow_pte_p)
+{
+    unsigned long old_spte, new_spte;
+
+    perfc_incrc(validate_pte_change);
+
+#if 0
+    FSH_LOG("validate_pte(old=%p new=%p)\n", old_pte, new_pte);
+#endif
+
+    old_spte = *shadow_pte_p;
+    l1pte_propagate_from_guest(d, new_pte, shadow_pte_p);
+    new_spte = *shadow_pte_p;
+
+    // only do the ref counting if something important changed.
+    //
+    if ( (old_spte ^ new_spte) & (PAGE_MASK | _PAGE_RW | _PAGE_PRESENT) )
+    {
+        if ( new_spte & _PAGE_PRESENT )
+            get_page_from_l1e(mk_l1_pgentry(new_spte), d);
+        if ( old_spte & _PAGE_PRESENT )
+            put_page_from_l1e(mk_l1_pgentry(old_spte), d);
+    }
+
+    // paranoia rules!
+    return 1;
+}
+
+// returns true if a tlb flush is needed
+//
+static int inline
+validate_pde_change(
+    struct domain *d,
+    unsigned long new_pde,
+    unsigned long *shadow_pde_p)
+{
+    unsigned long old_spde = *shadow_pde_p;
+    unsigned long new_spde;
+
+    perfc_incrc(validate_pde_change);
+
+    l2pde_propagate_from_guest(d, &new_pde, shadow_pde_p);
+    new_spde = *shadow_pde_p;
+
+    // only do the ref counting if something important changed.
+    //
+    if ( (old_spde ^ new_spde) & (PAGE_MASK | _PAGE_PRESENT) )
+    {
+        if ( new_spde & _PAGE_PRESENT )
+            get_shadow_ref(new_spde >> PAGE_SHIFT);
+        if ( old_spde & _PAGE_PRESENT )
+            put_shadow_ref(old_spde >> PAGE_SHIFT);
+    }
+
+    // paranoia rules!
+    return 1;
+}
+
 /*********************************************************************/
 
 #if SHADOW_HASH_DEBUG
+
 static void shadow_audit(struct domain *d, int print)
 {
     int live = 0, free = 0, j = 0, abs;
@@ -350,26 +625,25 @@ static void shadow_audit(struct domain *d, int print)
     for ( j = 0; j < shadow_ht_buckets; j++ )
     {
         a = &d->arch.shadow_ht[j];        
-        if ( a->pfn )
+        if ( a->gpfn_and_flags )
         {
             live++;
-            ASSERT(a->smfn_and_flags & PSH_pfn_mask);
+            ASSERT(a->smfn);
         }
         else
             ASSERT(!a->next);
-        ASSERT( (a->pfn & ~PSH_hl2) < 0x00100000UL);
+
         a = a->next;
         while ( a && (live < 9999) )
         { 
             live++; 
-            if ( (a->pfn == 0) || (a->smfn_and_flags == 0) )
+            if ( (a->gpfn_and_flags == 0) || (a->smfn == 0) )
             {
-                printk("XXX live=%d pfn=%p sp=%p next=%p\n",
-                       live, a->pfn, a->smfn_and_flags, a->next);
+                printk("XXX live=%d gpfn+flags=%p sp=%p next=%p\n",
+                       live, a->gpfn_and_flags, a->smfn, a->next);
                 BUG();
             }
-            ASSERT( (a->pfn & ~PSH_hl2) < 0x00100000UL);
-            ASSERT(a->smfn_and_flags & PSH_pfn_mask);
+            ASSERT(a->smfn);
             a = a->next; 
         }
         ASSERT(live < 9999);
@@ -379,21 +653,26 @@ static void shadow_audit(struct domain *d, int print)
         free++; 
 
     if ( print )
-        printk("Xlive=%d free=%d\n",live,free);
+        printk("Xlive=%d free=%d\n", live, free);
 
     // BUG: this only works if there's only a single domain which is
     //      using shadow tables.
     //
-    abs = ( perfc_value(shadow_l1_pages) +
-            perfc_value(shadow_l2_pages) +
-            perfc_value(hl2_table_pages) ) - live;
+    abs = (
+        perfc_value(shadow_l1_pages) +
+        perfc_value(shadow_l2_pages) +
+        perfc_value(hl2_table_pages) +
+        perfc_value(snapshot_pages)
+        ) - live;
 #ifdef PERF_COUNTERS
     if ( (abs < -1) || (abs > 1) )
     {
-        printk("live=%d free=%d l1=%d l2=%d hl2=%d\n", live, free,
+        printk("live=%d free=%d l1=%d l2=%d hl2=%d snapshot=%d\n",
+               live, free,
                perfc_value(shadow_l1_pages),
                perfc_value(shadow_l2_pages),
-               perfc_value(hl2_table_pages));
+               perfc_value(hl2_table_pages),
+               perfc_value(snapshot_pages));
         BUG();
     }
 #endif
@@ -414,30 +693,36 @@ static inline struct shadow_status *hash_bucket(
  * N.B. This takes a guest pfn (i.e. a pfn in the guest's namespace,
  *      which, depending on full shadow mode, may or may not equal
  *      its mfn).
- *      The shadow status it returns is a mfn.
+ *      It returns the shadow's mfn, or zero if it doesn't exist.
  */
+
 static inline unsigned long __shadow_status(
-    struct domain *d, unsigned int gpfn)
+    struct domain *d, unsigned long gpfn, unsigned long stype)
 {
     struct shadow_status *p, *x, *head;
+    unsigned long key = gpfn | stype;
 
     ASSERT(spin_is_locked(&d->arch.shadow_lock));
+    ASSERT(gpfn == (gpfn & PGT_mfn_mask));
+    ASSERT(stype && !(stype & ~PGT_type_mask));
+
+    perfc_incrc(shadow_status_calls);
 
     x = head = hash_bucket(d, gpfn);
     p = NULL;
 
-    //SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, x);
+    //SH_VVLOG("lookup gpfn=%08x type=%08x bucket=%p", gpfn, stype, x);
     shadow_audit(d, 0);
 
     do
     {
-        ASSERT(x->pfn || ((x == head) && (x->next == NULL)));
+        ASSERT(x->gpfn_and_flags || ((x == head) && (x->next == NULL)));
 
-        if ( x->pfn == gpfn )
+        if ( x->gpfn_and_flags == key )
         {
 #if SHADOW_DEBUG
             if ( unlikely(shadow_status_noswap) )
-                return x->smfn_and_flags;
+                return x->smfn;
 #endif
             /* Pull-to-front if 'x' isn't already the head item. */
             if ( unlikely(x != head) )
@@ -448,13 +733,16 @@ static inline unsigned long __shadow_status(
                 head->next = x;
 
                 /* Swap 'x' contents with head contents. */
-                SWAP(head->pfn, x->pfn);
-                SWAP(head->smfn_and_flags, x->smfn_and_flags);
+                SWAP(head->gpfn_and_flags, x->gpfn_and_flags);
+                SWAP(head->smfn, x->smfn);
+            }
+            else
+            {
+                perfc_incrc(shadow_status_hit_head);
             }
 
-            SH_VVLOG("lookup gpfn=%p => status=%p",
-                     gpfn, head->smfn_and_flags);
-            return head->smfn_and_flags;
+            SH_VVLOG("lookup gpfn=%p => status=%p", key, head->smfn);
+            return head->smfn;
         }
 
         p = x;
@@ -462,17 +750,68 @@ static inline unsigned long __shadow_status(
     }
     while ( x != NULL );
 
-    SH_VVLOG("lookup gpfn=%p => status=0", gpfn);
+    SH_VVLOG("lookup gpfn=%p => status=0", key);
+    perfc_incrc(shadow_status_miss);
     return 0;
 }
 
+/*
+ * Not clear if pull-to-front is worthwhile for this or not,
+ * as it generally needs to scan the entire bucket anyway.
+ * Much simpler without.
+ *
+ * Either returns PGT_none, or PGT_l{1,2,3,4}_page_table.
+ */
+static inline unsigned long
+shadow_max_pgtable_type(struct domain *d, unsigned long gpfn)
+{
+    struct shadow_status *x;
+    unsigned long pttype = PGT_none, type;
+
+    ASSERT(spin_is_locked(&d->arch.shadow_lock));
+    ASSERT(gpfn == (gpfn & PGT_mfn_mask));
+
+    x = hash_bucket(d, gpfn);
+
+    while ( x && x->gpfn_and_flags )
+    {
+        if ( (x->gpfn_and_flags & PGT_mfn_mask) == gpfn )
+        {
+            type = x->gpfn_and_flags & PGT_type_mask;
+
+            // Treat an HL2 as if it's an L1
+            //
+            if ( type == PGT_hl2_shadow )
+                type = PGT_l1_shadow;
+
+            // Ignore snapshots -- holding a snapshot of a page does not,
+            // by itself, mean the page is being used as a page table
+            //
+            if ( type == PGT_snapshot )
+                goto next;
+
+            // Early exit if we found the max possible value
+            //
+            if ( type == PGT_base_page_table )
+                return type;
+
+            if ( type > pttype )
+                pttype = type;
+        }
+    next:
+        x = x->next;
+    }
+
+    return pttype;
+}
+
 /*
  * N.B. We can make this locking more fine grained (e.g., per shadow page) if
  * it ever becomes a problem, but since we need a spin lock on the hash table 
  * anyway it's probably not worth being too clever.
  */
 static inline unsigned long get_shadow_status(
-    struct domain *d, unsigned int gpfn )
+    struct domain *d, unsigned long gpfn, unsigned long stype)
 {
     unsigned long res;
 
@@ -484,65 +823,66 @@ static inline unsigned long get_shadow_status(
      * has changed type. If we're in log dirty mode, we should set the
      * appropriate bit in the dirty bitmap.
      * N.B. The VA update path doesn't use this and is handled independently. 
-
-     XXX need to think this through for vmx guests, but probably OK
+     *
+     XXX need to think this through for vmx guests, but probably OK
      */
 
     shadow_lock(d);
 
     if ( shadow_mode_log_dirty(d) )
-        __mark_dirty(d, gpfn);
+        __mark_dirty(d, __gpfn_to_mfn(d, gpfn));
 
-    if ( !(res = __shadow_status(d, gpfn)) )
+    if ( !(res = __shadow_status(d, gpfn, stype)) )
         shadow_unlock(d);
 
     return res;
 }
 
 
-static inline void put_shadow_status(
-    struct domain *d)
+static inline void put_shadow_status(struct domain *d)
 {
     shadow_unlock(d);
 }
 
 
 static inline void delete_shadow_status( 
-    struct domain *d, unsigned int gpfn)
+    struct domain *d, unsigned int gpfn, unsigned int stype)
 {
     struct shadow_status *p, *x, *n, *head;
+    unsigned long key = gpfn | stype;
 
     ASSERT(spin_is_locked(&d->arch.shadow_lock));
-    ASSERT(gpfn != 0);
+    ASSERT(gpfn && !(gpfn & ~PGT_mfn_mask));
+    ASSERT(stype && !(stype & ~PGT_type_mask));
 
     head = hash_bucket(d, gpfn);
 
-    SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, head);
+    SH_VLOG("delete gpfn=%p t=%p bucket=%p", gpfn, stype, head);
     shadow_audit(d, 0);
 
     /* Match on head item? */
-    if ( head->pfn == gpfn )
+    if ( head->gpfn_and_flags == key )
     {
         if ( (n = head->next) != NULL )
         {
             /* Overwrite head with contents of following node. */
-            head->pfn            = n->pfn;
-            head->smfn_and_flags = n->smfn_and_flags;
+            head->gpfn_and_flags = n->gpfn_and_flags;
+            head->smfn           = n->smfn;
 
             /* Delete following node. */
             head->next           = n->next;
 
             /* Add deleted node to the free list. */
-            n->pfn            = 0;
-            n->smfn_and_flags = 0;
+            n->gpfn_and_flags = 0;
+            n->smfn           = 0;
             n->next           = d->arch.shadow_ht_free;
             d->arch.shadow_ht_free = n;
         }
         else
         {
             /* This bucket is now empty. Initialise the head node. */
-            head->pfn            = 0;
-            head->smfn_and_flags = 0;
+            head->gpfn_and_flags = 0;
+            head->smfn           = 0;
         }
 
         goto found;
@@ -553,14 +893,14 @@ static inline void delete_shadow_status(
 
     do
     {
-        if ( x->pfn == gpfn )
+        if ( x->gpfn_and_flags == key )
         {
             /* Delete matching node. */
             p->next = x->next;
 
             /* Add deleted node to the free list. */
-            x->pfn            = 0;
-            x->smfn_and_flags = 0;
+            x->gpfn_and_flags = 0;
+            x->smfn           = 0;
             x->next           = d->arch.shadow_ht_free;
             d->arch.shadow_ht_free = x;
 
@@ -576,34 +916,46 @@ static inline void delete_shadow_status(
     BUG();
 
  found:
+    // release ref to page
+    put_page(pfn_to_page(__gpfn_to_mfn(d, gpfn)));
+
     shadow_audit(d, 0);
 }
 
-
 static inline void set_shadow_status(
-    struct domain *d, unsigned int gpfn, unsigned long s)
+    struct domain *d, unsigned long gpfn,
+    unsigned long smfn, unsigned long stype)
 {
     struct shadow_status *x, *head, *extra;
     int i;
+    unsigned long gmfn = __gpfn_to_mfn(d, gpfn);
+    unsigned long key = gpfn | stype;
 
     ASSERT(spin_is_locked(&d->arch.shadow_lock));
-    ASSERT(gpfn != 0);
-    ASSERT(s & (PSH_shadowed | PSH_hl2));
+    ASSERT(gpfn && !(gpfn & ~PGT_mfn_mask));
+    ASSERT(pfn_is_ram(gmfn)); // XXX need to be more graceful
+    ASSERT(smfn && !(smfn & ~PGT_mfn_mask));
+    ASSERT(stype && !(stype & ~PGT_type_mask));
 
     x = head = hash_bucket(d, gpfn);
    
-    SH_VVLOG("set gpfn=%08x s=%p bucket=%p(%p)", gpfn, s, x, x->next);
+    SH_VLOG("set gpfn=%p smfn=%p t=%p bucket=%p(%p)",
+             gpfn, smfn, stype, x, x->next);
     shadow_audit(d, 0);
 
+    // grab a reference to the guest page to represent the entry in the shadow
+    // hash table
+    //
+    get_page(pfn_to_page(gmfn), d);
+
     /*
      * STEP 1. If page is already in the table, update it in place.
      */
-
     do
     {
-        if ( x->pfn == gpfn )
+        if ( x->gpfn_and_flags == key )
         {
-            x->smfn_and_flags = s;
+            x->smfn = smfn;
             goto done;
         }
 
@@ -616,10 +968,10 @@ static inline void set_shadow_status(
      */
 
     /* If the bucket is empty then insert the new page as the head item. */
-    if ( head->pfn == 0 )
+    if ( head->gpfn_and_flags == 0 )
     {
-        head->pfn            = gpfn;
-        head->smfn_and_flags = s;
+        head->gpfn_and_flags = key;
+        head->smfn           = smfn;
         ASSERT(head->next == NULL);
         goto done;
     }
@@ -658,35 +1010,107 @@ static inline void set_shadow_status(
     d->arch.shadow_ht_free = x->next;
 
     /* Initialise the new node and insert directly after the head item. */
-    x->pfn            = gpfn;
-    x->smfn_and_flags = s;
+    x->gpfn_and_flags = key;
+    x->smfn           = smfn;
     x->next           = head->next;
     head->next        = x;
 
  done:
     shadow_audit(d, 0);
 }
-  
+
+/************************************************************************/
+
+extern void shadow_map_l1_into_current_l2(unsigned long va);
+
+void static inline
+shadow_set_l1e(unsigned long va, unsigned long new_spte, int create_l1_shadow)
+{
+    struct exec_domain *ed = current;
+    struct domain *d = ed->domain;
+    unsigned long sl2e, old_spte;
+
+#if 0
+    printk("shadow_set_l1e(va=%p, new_spte=%p, create=%d)\n",
+           va, new_spte, create_l1_shadow);
+#endif
+
+    __shadow_get_l2e(ed, va, &sl2e);
+    if ( !(sl2e & _PAGE_PRESENT) )
+    {
+        /*
+         * Either the L1 is not shadowed, or the shadow isn't linked into
+         * the current shadow L2.
+         */
+        if ( create_l1_shadow )
+        {
+            perfc_incrc(shadow_set_l1e_force_map);
+            shadow_map_l1_into_current_l2(va);
+        }
+        else /* check to see if it exists; if so, link it in */
+        {
+            unsigned long gpde =
+                l2_pgentry_val(linear_l2_table(ed)[l2_table_offset(va)]);
+            unsigned long gl1pfn = gpde >> PAGE_SHIFT;
+            unsigned long sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow);
+
+            ASSERT( gpde & _PAGE_PRESENT );
+
+            if ( sl1mfn )
+            {
+                perfc_incrc(shadow_set_l1e_unlinked);
+                get_shadow_ref(sl1mfn);
+                l2pde_general(d, &gpde, &sl2e, sl1mfn);
+                __guest_set_l2e(ed, va, gpde);
+                __shadow_set_l2e(ed, va, sl2e);
+            }
+            else
+            {
+                // no shadow exists, so there's nothing to do.
+                perfc_incrc(shadow_set_l1e_fail);
+                return;
+            }
+        }
+    }
+
+    old_spte = l1_pgentry_val(shadow_linear_pg_table[l1_linear_offset(va)]);
+    shadow_linear_pg_table[l1_linear_offset(va)] = mk_l1_pgentry(new_spte);
+
+    // only do the ref counting if something important changed.
+    //
+    if ( (old_spte ^ new_spte) & (PAGE_MASK | _PAGE_RW | _PAGE_PRESENT) )
+    {
+        if ( new_spte & _PAGE_PRESENT )
+            get_page_from_l1e(mk_l1_pgentry(new_spte), d);
+        if ( old_spte & _PAGE_PRESENT )
+            put_page_from_l1e(mk_l1_pgentry(old_spte), d);
+    }
+}
+
+/************************************************************************/
+
 static inline unsigned long gva_to_gpte(unsigned long gva)
 {
-    unsigned long gpde, gpte, pfn, index;
+    unsigned long gpde, gpte;
     struct exec_domain *ed = current;
 
+    ASSERT( shadow_mode_translate(current->domain) );
+
     __guest_get_l2e(ed, gva, &gpde);
-    if (!(gpde & _PAGE_PRESENT))
+    if ( unlikely(!(gpde & _PAGE_PRESENT)) )
         return 0;
 
-    index = l2_table_offset(gva);
-
-    if (!l2_pgentry_val(ed->arch.hl2_vtable[index])) {
-        pfn = phys_to_machine_mapping(gpde >> PAGE_SHIFT);
-        ed->arch.hl2_vtable[index] = 
-            mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
-    }
+    // This is actually overkill - we only need to make sure the hl2
+    // is in-sync.
+    //
+    shadow_sync_va(ed, gva);
 
     if ( unlikely(__get_user(gpte, (unsigned long *)
                              &linear_pg_table[gva >> PAGE_SHIFT])) )
+    {
+        FSH_LOG("gva_to_gpte got a fault on gva=%p\n", gva);
         return 0;
+    }
 
     return gpte;
 }
@@ -702,94 +1126,19 @@ static inline unsigned long gva_to_gpa(unsigned long gva)
     return (gpte & PAGE_MASK) + (gva & ~PAGE_MASK); 
 }
 
-static inline void hl2_table_invalidate(struct exec_domain *ed)
-{
-    /*
-     * Need to optimize this
-     */
-    memset(ed->arch.hl2_vtable, 0, PAGE_SIZE);
-}
-
-static inline void __update_pagetables(struct exec_domain *ed)
-{
-    struct domain *d = ed->domain;
-    unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
-    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
-    unsigned long smfn = __shadow_status(d, gpfn) & PSH_pfn_mask;
-
-    SH_VVLOG("0: __update_pagetables(gmfn=%p, smfn=%p)", gmfn, smfn);
-
-    if ( unlikely(smfn == 0) )
-        smfn = shadow_l2_table(d, gmfn);
-
-    ed->arch.shadow_table = mk_pagetable(smfn<<PAGE_SHIFT);
-
-    if ( shadow_mode_translate(d) )
-    {
-        l2_pgentry_t *mpl2e = ed->arch.monitor_vtable;
-        l2_pgentry_t *gpl2e, *spl2e;
-        unsigned long hl2_status, hl2mfn, offset;
-        int need_flush = 0;
-
-        if ( ed->arch.guest_vtable )
-            unmap_domain_mem(ed->arch.guest_vtable);
-        if ( ed->arch.shadow_vtable )
-            unmap_domain_mem(ed->arch.shadow_vtable);
-        if ( ed->arch.hl2_vtable )
-            unmap_domain_mem(ed->arch.hl2_vtable);
-
-        gpl2e = ed->arch.guest_vtable =
-            map_domain_mem(pagetable_val(ed->arch.guest_table));
-        spl2e = ed->arch.shadow_vtable =
-            map_domain_mem(pagetable_val(ed->arch.shadow_table));
-
-        hl2_status = __shadow_status(d, gpfn | PSH_hl2);
-        if ( unlikely(!(hl2_status & PSH_hl2)) )
-            hl2_status = mk_hl2_table(ed);
-
-        hl2mfn = hl2_status & PSH_pfn_mask;
-        ed->arch.hl2_vtable = map_domain_mem(hl2mfn << PAGE_SHIFT);
-
-        offset = l2_table_offset(LINEAR_PT_VIRT_START);
-        if ( hl2mfn != (l2_pgentry_val(mpl2e[offset]) >> PAGE_SHIFT) )
-        {
-            mpl2e[offset] =
-                mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
-            need_flush = 1;
-        }
-
-        if ( shadow_mode_external(d ) )
-        {
-            offset = l2_table_offset(SH_LINEAR_PT_VIRT_START);
-            if ( smfn != (l2_pgentry_val(mpl2e[offset]) >> PAGE_SHIFT) )
-            {
-                mpl2e[offset] =
-                    mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
-                need_flush = 1;
-            }
-        }
-
-        if ( VMX_DOMAIN(ed) )
-        {
-            // Why is VMX mode doing this?
-            shadow_invalidate(ed);
-            hl2_table_invalidate(ed);
-        }
-
-        if ( need_flush )
-            local_flush_tlb();
-    }
-}
+/************************************************************************/
 
+extern void __update_pagetables(struct exec_domain *ed);
 static inline void update_pagetables(struct exec_domain *ed)
 {
     struct domain *d = ed->domain;
-    int paging_enabled =
+
 #ifdef CONFIG_VMX
+    int paging_enabled =
         !VMX_DOMAIN(ed) ||
         test_bit(VMX_CPU_STATE_PG_ENABLED, &ed->arch.arch_vmx.cpu_state);
 #else
-        1;
+    const int paging_enabled = 1;
 #endif
 
     /*
@@ -805,12 +1154,8 @@ static inline void update_pagetables(struct exec_domain *ed)
         shadow_unlock(d);
     }
 
-    if ( !shadow_mode_external(d) )
+    if ( likely(!shadow_mode_external(d)) )
     {
-        /*
-         * Internal page tables:
-         * No need to allocate a separate page table for Xen.
-         */
 #ifdef __x86_64__
         if ( !(ed->arch.flags & TF_kernel_mode) )
             ed->arch.monitor_table = ed->arch.guest_table_user;
@@ -821,27 +1166,17 @@ static inline void update_pagetables(struct exec_domain *ed)
         else
             ed->arch.monitor_table = ed->arch.guest_table;
     }
-    else
-    {
-        /*
-         * External page tables:
-         * Allocate a monitor page table if we don't already have one.
-         */
-        if ( unlikely(!pagetable_val(ed->arch.monitor_table)) )
-            ed->arch.monitor_table =
-                mk_pagetable(alloc_monitor_pagetable(ed) << PAGE_SHIFT);
-    }
 }
 
 #if SHADOW_DEBUG
-extern int _check_pagetable(struct domain *d, pagetable_t pt, char *s);
-extern int _check_all_pagetables(struct domain *d, char *s);
+extern int _check_pagetable(struct exec_domain *ed, char *s);
+extern int _check_all_pagetables(struct exec_domain *ed, char *s);
 
-#define check_pagetable(_d, _pt, _s) _check_pagetable(_d, _pt, _s)
-//#define check_pagetable(_d, _pt, _s) _check_all_pagetables(_d, _s)
+#define check_pagetable(_ed, _s) _check_pagetable(_ed, _s)
+//#define check_pagetable(_ed, _s) _check_all_pagetables(_ed, _s)
 
 #else
-#define check_pagetable(_d, _pt, _s) ((void)0)
+#define check_pagetable(_ed, _s) ((void)0)
 #endif
 
 #endif /* XEN_SHADOW_H */
index b75df5ca281c1182817112c5a21a6604ab55641e..897ac2d0bd951d6694a0c64776f5ea959c066ca5 100644 (file)
@@ -68,7 +68,7 @@ typedef l2_pgentry_t root_pgentry_t;
 #define L1_DISALLOW_MASK (3UL << 7)
 #define L2_DISALLOW_MASK (7UL << 7)
 #define L3_DISALLOW_MASK (7UL << 7)
-#define L2_DISALLOW_MASK (7UL << 7)
+#define L4_DISALLOW_MASK (7UL << 7)
 
 #endif /* __X86_32_PAGE_H__ */
 
index 8db16e2512402ca3fceb370239f42e248c7d62f7..15db59d73d929d7f408748592a1883136437f46c 100644 (file)
@@ -27,6 +27,4 @@ extern void domain_relinquish_memory(struct domain *d);
 
 extern void dump_pageframe_info(struct domain *d);
 
-extern unsigned long alloc_monitor_pagetable(struct exec_domain *ed);
-
 #endif /* __XEN_DOMAIN_H__ */
index c67c5f46b961364e738bc587259decf603399e4c..57ecb1eec208f567662c36bbc2fde9125e2e8503 100644 (file)
@@ -48,3 +48,33 @@ PERFCOUNTER_ARRAY( exceptions, "exceptions", 32 )
 #define VMX_PERF_VECTOR_SIZE 0x20
 PERFCOUNTER_ARRAY( vmexits, "vmexits", VMX_PERF_EXIT_REASON_SIZE )
 PERFCOUNTER_ARRAY( cause_vector, "cause vector", VMX_PERF_VECTOR_SIZE )
+
+
+PERFCOUNTER_CPU( shadow_hl2_table_count,   "shadow_hl2_table count" )
+PERFCOUNTER_CPU( shadow_set_l1e_force_map, "shadow_set_l1e forced to map l1" )
+PERFCOUNTER_CPU( shadow_set_l1e_unlinked,  "shadow_set_l1e found unlinked l1" )
+PERFCOUNTER_CPU( shadow_set_l1e_fail,      "shadow_set_l1e failed (no sl1)" )
+PERFCOUNTER_CPU( shadow_invlpg_faults,     "shadow_invlpg's get_user faulted")
+
+
+/* STATUS counters do not reset when 'P' is hit */
+PERFSTATUS( snapshot_pages,  "current # fshadow snapshot pages" )
+
+PERFCOUNTER_CPU(shadow_status_calls,    "calls to __shadow_status" )
+PERFCOUNTER_CPU(shadow_status_miss,     "missed shadow cache" )
+PERFCOUNTER_CPU(shadow_status_hit_head, "hits on head of bucket" )
+
+PERFCOUNTER_CPU(shadow_sync_all,                   "calls to shadow_sync_all")
+PERFCOUNTER_CPU(shadow_make_snapshot,              "snapshots created")
+PERFCOUNTER_CPU(shadow_mark_mfn_out_of_sync_calls, "calls to shadow_mk_out_of_sync")
+PERFCOUNTER_CPU(shadow_out_of_sync_calls,          "calls to shadow_out_of_sync")
+PERFCOUNTER_CPU(snapshot_entry_matches_calls,      "calls to ss_entry_matches")
+PERFCOUNTER_CPU(snapshot_entry_matches_true,       "ss_entry_matches returns true")
+
+PERFCOUNTER_CPU(shadow_fault_calls,                "calls to shadow_fault")
+PERFCOUNTER_CPU(shadow_fault_bail_pde_not_present, "sf bailed due to pde not present")
+PERFCOUNTER_CPU(shadow_fault_bail_pte_not_present, "sf bailed due to pte not present")
+PERFCOUNTER_CPU(shadow_fault_bail_ro_mapping,      "sf bailed due to a ro mapping")
+PERFCOUNTER_CPU(shadow_fault_fixed,                "sf fixed the pgfault")
+PERFCOUNTER_CPU(validate_pte_change,               "calls to validate_pte_change")
+PERFCOUNTER_CPU(validate_pde_change,               "calls to validate_pde_change")